library(tidyverse) # for everything
library(readxl) # for reading in excel files
library(janitor) # data checks and cleaning
library(glue) # for easy pasting
library(FactoMineR) # for PCA
library(factoextra) # for PCA
library(rstatix) # for stats
library(pheatmap) # for heatmaps
library(plotly) # for interactive plots
library(htmlwidgets) # for saving interactive plots
library(devtools)
library(notame) # used for feature clustering
library(doParallel)
library(igraph) # feature clustering
library(ggpubr) # visualizations
library(knitr) # clean table printing
library(rmarkdown)
library(mixOmics) # for multilevel PCAs
library(pathview) # for functional analysis and KEGG annotation
library(ggcorrplot)
library(corrr)
library(ggthemes)# rename "row ID"
omicsdata <- omicsdata %>%
rename("row_ID" = `row ID`)
# how many features
nrow(omicsdata)## [1] 1444
## # A tibble: 0 × 60
## # ℹ 60 variables: mz_rt <chr>, dupe_count <int>, row_ID <dbl>,
## # 6104_U2_HILICPOS_19 <dbl>, 6110_U2_HILICPOS_34 <dbl>,
## # 6108_U2_HILICPOS_27 <dbl>, 6111_U4_HILICPOS_42 <dbl>,
## # 6101_U3_HILICPOS_29 <dbl>, 6104_U1_HILICPOS_55 <dbl>,
## # 6103_U4_HILICPOS_53 <dbl>, 6110_U3_HILICPOS_45 <dbl>,
## # 6111_U2_HILICPOS_35 <dbl>, 6111_U1_HILICPOS_51 <dbl>,
## # 6105_U3_HILICPOS_39 <dbl>, 6109_U2_HILICPOS_13 <dbl>, …
# remove dupes
omicsdata <- omicsdata %>%
distinct(mz_rt, .keep_all = TRUE)
# check again for dupes
omicsdata %>% get_dupes(mz_rt)## # A tibble: 0 × 60
## # ℹ 60 variables: mz_rt <chr>, dupe_count <int>, row_ID <dbl>,
## # 6104_U2_HILICPOS_19 <dbl>, 6110_U2_HILICPOS_34 <dbl>,
## # 6108_U2_HILICPOS_27 <dbl>, 6111_U4_HILICPOS_42 <dbl>,
## # 6101_U3_HILICPOS_29 <dbl>, 6104_U1_HILICPOS_55 <dbl>,
## # 6103_U4_HILICPOS_53 <dbl>, 6110_U3_HILICPOS_45 <dbl>,
## # 6111_U2_HILICPOS_35 <dbl>, 6111_U1_HILICPOS_51 <dbl>,
## # 6105_U3_HILICPOS_39 <dbl>, 6109_U2_HILICPOS_13 <dbl>, …
## [1] 1444
Sometimes a weird logical column (lgl) comes up in my data. Let’s check if it’s there
## [1] "mz_rt" "row_ID"
## [3] "6104_U2_HILICPOS_19" "6110_U2_HILICPOS_34"
## [5] "6108_U2_HILICPOS_27" "6111_U4_HILICPOS_42"
## [7] "6101_U3_HILICPOS_29" "6104_U1_HILICPOS_55"
## [9] "6103_U4_HILICPOS_53" "6110_U3_HILICPOS_45"
## [11] "6111_U2_HILICPOS_35" "6111_U1_HILICPOS_51"
## [13] "6105_U3_HILICPOS_39" "6109_U2_HILICPOS_13"
## [15] "6108_U1_HILICPOS_44" "6109_U4_HILICPOS_22"
## [17] "6101_U4_HILICPOS_14" "6101_U2_HILICPOS_30"
## [19] "6110_U4_HILICPOS_10" "6105_U1_HILICPOS_43"
## [21] "6105_U4_HILICPOS_47" "6102_U1_HILICPOS_26"
## [23] "6102_U2_HILICPOS_16" "6104_U3_HILICPOS_32"
## [25] "6104_U4_HILICPOS_12" "6113_U3_HILICPOS_15"
## [27] "6113_U2_HILICPOS_31" "6109_U3_HILICPOS_52"
## [29] "6102_U4_HILICPOS_50" "6102_U3_HILICPOS_48"
## [31] "6108_U4_HILICPOS_18" "6106_U2_HILICPOS_46"
## [33] "6111_U3_HILICPOS_54" "6110_U1_HILICPOS_11"
## [35] "6108_U3_HILICPOS_28" "6106_U3_HILICPOS_20"
## [37] "6105_U2_HILICPOS_40" "6112_U2_HILICPOS_37"
## [39] "6103_U1_HILICPOS_21" "6113_U1_HILICPOS_24"
## [41] "6103_U3_HILICPOS_61" "6101_U1_HILICPOS_59"
## [43] "6112_U1_HILICPOS_38" "6109_U1_HILICPOS_58"
## [45] "6112_U3_HILICPOS_56" "6106_U4_HILICPOS_36"
## [47] "6103_U2_HILICPOS_60" "6106_U1_HILICPOS_23"
## [49] "6112_U4_HILICPOS_63" "PooledQC5_HILICPOS_11_1"
## [51] "6113_U4_HILICPOS_62" "PooledQC7_HILICPOS_25"
## [53] "PooledQC6_HILICPOS_17" "PooledQC9_HILICPOS_41"
## [55] "PooledQC10_HILICPOS_49" "PooledQC8_HILICPOS_33"
## [57] "PooledQC11_HILICPOS_57" "PooledQC12_HILICPOS_64"
## [59] "...59"
# remove weird lgl column
omicsdata <- omicsdata %>%
dplyr::select(!where(is.logical))
colnames(omicsdata)## [1] "mz_rt" "row_ID"
## [3] "6104_U2_HILICPOS_19" "6110_U2_HILICPOS_34"
## [5] "6108_U2_HILICPOS_27" "6111_U4_HILICPOS_42"
## [7] "6101_U3_HILICPOS_29" "6104_U1_HILICPOS_55"
## [9] "6103_U4_HILICPOS_53" "6110_U3_HILICPOS_45"
## [11] "6111_U2_HILICPOS_35" "6111_U1_HILICPOS_51"
## [13] "6105_U3_HILICPOS_39" "6109_U2_HILICPOS_13"
## [15] "6108_U1_HILICPOS_44" "6109_U4_HILICPOS_22"
## [17] "6101_U4_HILICPOS_14" "6101_U2_HILICPOS_30"
## [19] "6110_U4_HILICPOS_10" "6105_U1_HILICPOS_43"
## [21] "6105_U4_HILICPOS_47" "6102_U1_HILICPOS_26"
## [23] "6102_U2_HILICPOS_16" "6104_U3_HILICPOS_32"
## [25] "6104_U4_HILICPOS_12" "6113_U3_HILICPOS_15"
## [27] "6113_U2_HILICPOS_31" "6109_U3_HILICPOS_52"
## [29] "6102_U4_HILICPOS_50" "6102_U3_HILICPOS_48"
## [31] "6108_U4_HILICPOS_18" "6106_U2_HILICPOS_46"
## [33] "6111_U3_HILICPOS_54" "6110_U1_HILICPOS_11"
## [35] "6108_U3_HILICPOS_28" "6106_U3_HILICPOS_20"
## [37] "6105_U2_HILICPOS_40" "6112_U2_HILICPOS_37"
## [39] "6103_U1_HILICPOS_21" "6113_U1_HILICPOS_24"
## [41] "6103_U3_HILICPOS_61" "6101_U1_HILICPOS_59"
## [43] "6112_U1_HILICPOS_38" "6109_U1_HILICPOS_58"
## [45] "6112_U3_HILICPOS_56" "6106_U4_HILICPOS_36"
## [47] "6103_U2_HILICPOS_60" "6106_U1_HILICPOS_23"
## [49] "6112_U4_HILICPOS_63" "PooledQC5_HILICPOS_11_1"
## [51] "6113_U4_HILICPOS_62" "PooledQC7_HILICPOS_25"
## [53] "PooledQC6_HILICPOS_17" "PooledQC9_HILICPOS_41"
## [55] "PooledQC10_HILICPOS_49" "PooledQC8_HILICPOS_33"
## [57] "PooledQC11_HILICPOS_57" "PooledQC12_HILICPOS_64"
# create long df for omics df
omicsdata_tidy <- omicsdata %>%
pivot_longer(cols = 3:ncol(.),
names_to = "sample_ID",
values_to = "peak_height")
# combine meta and omics dfs
df_combined <- full_join(omicsdata_tidy,
metadata,
by = c("sample_ID" = "sample_ID"))
# separate mz and rt
df_combined_sep <- df_combined %>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
# convert columns to correct type
df_combined_sep$mz <- as.numeric(df_combined_sep$mz)
df_combined_sep$rt <- as.numeric(df_combined_sep$rt)
df_combined_sep$Subject <- as.character(df_combined_sep$Subject)
df_combined_sep$Intervention <- as.character(df_combined_sep$Intervention)
# rearrange column order
df_combined_sep <- df_combined_sep %>%
dplyr::select(sample_ID, pre_post, Intervention, everything())
str(df_combined_sep)## tibble [80,864 × 14] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:80864] "6104_U2_HILICPOS_19" "6110_U2_HILICPOS_34" "6108_U2_HILICPOS_27" "6111_U4_HILICPOS_42" ...
## $ pre_post : chr [1:80864] "post" "post" "post" "post" ...
## $ Intervention : chr [1:80864] "Yellow" "Yellow" "Red" "Red" ...
## $ mz : num [1:80864] 553 553 553 553 553 ...
## $ rt : num [1:80864] 0.421 0.421 0.421 0.421 0.421 0.421 0.421 0.421 0.421 0.421 ...
## $ row_ID : num [1:80864] 152 152 152 152 152 152 152 152 152 152 ...
## $ peak_height : num [1:80864] 3029 4240 2162 8723 3652 ...
## $ Subject : chr [1:80864] "6104" "6110" "6108" "6111" ...
## $ Period : chr [1:80864] "U2" "U2" "U2" "U4" ...
## $ sequence : chr [1:80864] "Y_R" "Y_R" "R_Y" "Y_R" ...
## $ Intervention_week: num [1:80864] 6 6 6 14 10 2 14 10 6 2 ...
## $ Sex : chr [1:80864] "M" "M" "M" "M" ...
## $ Age : num [1:80864] 54 36 68 46 58 54 60 36 46 46 ...
## $ BMI : num [1:80864] 33.1 29.9 31.8 30 31.1 ...
# plot
(plot_mzvsrt <- df_combined_sep %>%
ggplot(aes(x = rt, y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "Retention time, min",
y = "m/z, neutral",
title = "mz across RT for all features"))# NAs in all data including QCs
NAbyRow <- rowSums(is.na(omicsdata[,-1]))
hist(NAbyRow,
breaks = 56, # because there are 56 samples, 48 samples + 8 QCs
xlab = "Number of missing values",
ylab = "Number of metabolites",
main = "How many missing values are there?")# samples only (no QCs)
omicsdata_noQC <- omicsdata %>%
dplyr::select(-contains("QC"))
#NAs in samples only?
NAbyRow_noQC <- rowSums(is.na(omicsdata_noQC[,-1]))
hist(NAbyRow_noQC,
breaks = 48, # because there are 48 samples
xlab = "Number of missing values",
ylab = "Number of metabolites",
main = "How many missing values are there?")Are there any missing values in QCs? There shouldn’t be after data preprocessing/filtering
omicsdata_QC <- omicsdata %>%
dplyr::select(starts_with("P"))
NAbyRow_QC <- colSums(is.na(omicsdata_QC))
# lets confirm that there are no missing values from my QCs
sum(NAbyRow_QC) # no## [1] 0
# calculate how many NAs there are per feature in whole data set
contains_NAs <- df_combined %>%
group_by(mz_rt) %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(contains_NAs)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 111.0438_5.483 TRUE 44
## 2 121.0283_5.484 TRUE 44
## 3 121.0283_5.936 TRUE 43
## 4 121.0285_7.146 TRUE 22
## 5 121.0684_7.945 TRUE 1
## 6 122.0634_4.935 TRUE 3
Update May 3, 2023: I’ve been seeing outliers in unsupervised analyses. So to handle this, I think it is best to filter out metabolites that are only present in one person. So I will remove metabolites that are missing from at least 44 samples. Taking this out for now.
# remove features that have 44 or more NAs
omit_features <- contains_NAs %>%
filter(n >= 44)
#preview
nrow(omit_features) # 109 features to remove
# how many features to remove?
nrow(omicsdata) - nrow(omit_features)
# now remove these features from the omics dataset
omicsdata <- omicsdata %>%
anti_join(omit_features,
by = "mz_rt")
# how many features are there now?
nrow(omicsdata)# impute any missing values by replacing them with 1/2 of the lowest peak height value of a feature (i.e. in a row).
imputed_omicsdata <- omicsdata
imputed_omicsdata[] <- lapply(imputed_omicsdata,
function(x) ifelse(is.na(x),
min(x, na.rm = TRUE)/2, x))
dim(imputed_omicsdata)## [1] 1444 58
Are there any NAs?
## [1] 0
# create long df for imputed omics df
imputed_omicsdata_tidy <- imputed_omicsdata %>%
pivot_longer(cols = 3:ncol(.),
names_to = "sample_ID",
values_to = "peak_height")
# combine meta and imputed omics dfs
imputed_fulldata <- full_join(imputed_omicsdata_tidy,
metadata,
by = c("sample_ID" = "sample_ID"))
# separate mz and rt
imputed_fulldata_sep <- imputed_fulldata %>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
# convert columns to correct type
imputed_fulldata_sep$mz <- as.numeric(imputed_fulldata_sep$mz)
imputed_fulldata_sep$rt <- as.numeric(imputed_fulldata_sep$rt)
imputed_fulldata_sep$Subject <- as.character(imputed_fulldata_sep$Subject)
imputed_fulldata_sep$Intervention <- as.character(imputed_fulldata_sep$Intervention)vignette for reference
# rt vs mz plot
imputed_fulldata_sep %>%
ggplot(aes(x = rt, y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "RT (min)",
y = "mz")# create features list from imputed data set to only include unique feature ID's (mz_rt), mz and RT
features <- imputed_fulldata_sep %>%
cbind(imputed_fulldata$mz_rt) %>%
rename("mz_rt" = "imputed_fulldata$mz_rt") %>%
dplyr::select(c(mz_rt, mz, rt)) %>%
distinct() # remove the duplicate rows
# create a second data frame which is just imputed_fulldata restructured to another wide format
data_notame <- data.frame(imputed_omicsdata %>%
dplyr::select(-row_ID) %>%
t())
data_notame <- data_notame %>%
tibble::rownames_to_column() %>% # change samples from rownames to its own column
row_to_names(row_number = 1) # change the feature IDs (mz_rt) from first row obs into column namesCheck structures
## 'data.frame': 1444 obs. of 3 variables:
## $ mz_rt: chr "553.3887_0.421" "666.637_0.431" "489.2284_0.496" "696.649_0.442" ...
## $ mz : num 553 667 489 697 297 ...
## $ rt : num 0.421 0.431 0.496 0.442 0.496 0.463 0.501 0.456 0.467 0.471 ...
## # A tibble: 1,444 × 3
## mz_rt mz rt
## <chr> <dbl> <dbl>
## 1 553.3887_0.421 553. 0.421
## 2 666.637_0.431 667. 0.431
## 3 489.2284_0.496 489. 0.496
## 4 696.649_0.442 697. 0.442
## 5 297.1938_0.496 297. 0.496
## 6 356.2792_0.463 356. 0.463
## 7 349.1768_0.501 349. 0.501
## 8 376.3184_0.456 376. 0.456
## 9 572.3994_0.467 572. 0.467
## 10 188.0705_0.471 188. 0.471
## # ℹ 1,434 more rows
## # A tibble: 56 × 1,445
## mz_rt `553.3887_0.421` `666.637_0.431` `489.2284_0.496` `696.649_0.442`
## <chr> <chr> <chr> <chr> <chr>
## 1 6104_U2_HI… " 3028.5269" " 20290.4630" " 1683.3690" " 4875.8610"
## 2 6110_U2_HI… " 4240.2890" " 501.9858" " 2050.1914" " 501.9858"
## 3 6108_U2_HI… " 2162.1748" " 505.4764" " 505.4764" " 1630.4479"
## 4 6111_U4_HI… " 8723.3040" " 1186.7206" " 1205.0459" " 1212.5492"
## 5 6101_U3_HI… " 3652.0537" " 1152.5675" " 500.2496" " 1005.7766"
## 6 6104_U1_HI… " 6700.4805" " 1479.3018" " 1635.1193" " 13722.4230"
## 7 6103_U4_HI… " 7693.8310" " 3270.3160" " 501.4131" " 501.4131"
## 8 6110_U3_HI… " 1657.042" " 500.043" " 1443.855" " 1172.850"
## 9 6111_U2_HI… " 7980.3076" " 1054.9924" " 1205.5146" " 1201.1527"
## 10 6111_U1_HI… " 4490.8410" " 1628.5261" " 2146.6333" " 1494.4916"
## # ℹ 46 more rows
## # ℹ 1,440 more variables: `297.1938_0.496` <chr>, `356.2792_0.463` <chr>,
## # `349.1768_0.501` <chr>, `376.3184_0.456` <chr>, `572.3994_0.467` <chr>,
## # `188.0705_0.471` <chr>, `342.2636_0.484` <chr>, `357.1666_0.506` <chr>,
## # `313.1894_0.512` <chr>, `324.2159_0.482` <chr>, `472.3114_0.479` <chr>,
## # `568.4552_0.447` <chr>, `169.0495_0.477` <chr>, `328.248_0.503` <chr>,
## # `456.3161_0.48` <chr>, `540.4236_0.454` <chr>, `292.2058_0.518` <chr>, …
# change to results to numeric
data_notame <- data_notame %>%
mutate_at(-1, as.numeric)
tibble(data_notame)## # A tibble: 56 × 1,445
## mz_rt `553.3887_0.421` `666.637_0.431` `489.2284_0.496` `696.649_0.442`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6104_U2_HI… 3029. 20290. 1683. 4876.
## 2 6110_U2_HI… 4240. 502. 2050. 502.
## 3 6108_U2_HI… 2162. 505. 505. 1630.
## 4 6111_U4_HI… 8723. 1187. 1205. 1213.
## 5 6101_U3_HI… 3652. 1153. 500. 1006.
## 6 6104_U1_HI… 6700. 1479. 1635. 13722.
## 7 6103_U4_HI… 7694. 3270. 501. 501.
## 8 6110_U3_HI… 1657. 500. 1444. 1173.
## 9 6111_U2_HI… 7980. 1055. 1206. 1201.
## 10 6111_U1_HI… 4491. 1629. 2147. 1494.
## # ℹ 46 more rows
## # ℹ 1,440 more variables: `297.1938_0.496` <dbl>, `356.2792_0.463` <dbl>,
## # `349.1768_0.501` <dbl>, `376.3184_0.456` <dbl>, `572.3994_0.467` <dbl>,
## # `188.0705_0.471` <dbl>, `342.2636_0.484` <dbl>, `357.1666_0.506` <dbl>,
## # `313.1894_0.512` <dbl>, `324.2159_0.482` <dbl>, `472.3114_0.479` <dbl>,
## # `568.4552_0.447` <dbl>, `169.0495_0.477` <dbl>, `328.248_0.503` <dbl>,
## # `456.3161_0.48` <dbl>, `540.4236_0.454` <dbl>, `292.2058_0.518` <dbl>, …
connection <- find_connections(data = data_notame,
features = features,
corr_thresh = 0.9,
rt_window = 1/60,
name_col = "mz_rt",
mz_col = "mz",
rt_col = "rt")## [1] 100
## [1] 200
## [1] 300
## [1] 400
## [1] 500
## [1] 600
## [1] 700
## [1] 800
## [1] 900
## [1] 1000
## [1] 1100
## [1] 1200
## [1] 1300
## [1] 1400
## x y cor rt_diff mz_diff
## 1 489.2284_0.496 644.3586_0.492 0.9693076 -0.004 155.1302
## 2 572.3994_0.467 456.3161_0.48 0.9617216 0.013 -116.0833
## 3 313.1894_0.512 270.1866_0.504 0.9164155 -0.008 -43.0028
## 4 313.1894_0.512 244.1663_0.521 0.9075760 0.009 -69.0231
## 5 313.1894_0.512 522.2943_0.527 0.9014331 0.015 209.1049
## 6 568.4552_0.447 540.4236_0.454 0.9884175 0.007 -28.0316
## 167 components found
##
## Component 100 / 167
## 50 components found
##
## 12 components found
##
## 3 components found
# assign a cluster ID to all features. Clusters are named after feature with highest median peak height
features_clustered <- assign_cluster_id(data_notame, clusters, features, name_col = "mz_rt")
# visualize clusters
#visualize_clusters(data_notame, features, clusters, min_size = 3, rt_window = 2,name_col = "mz_rt", mz_col = "mz", rt_col = "rt", file_path = "~/path/to/project/")
# lets see how many features are removed when we only keep one feature per cluster
pulled <- pull_clusters(data_notame, features_clustered, name_col = "mz_rt")
cluster_data <- pulled$cdata
cluster_features <- pulled$cfeatures
# export clustered feature list
write_csv(cluster_features,
"cluster_features-_hilic-pos.csv")
nrow(omicsdata) - nrow(cluster_features)## [1] 363
# transpose the full dataset back to wide so that it is more similar to the notame dataset
imputed_fulldata_wide <- imputed_fulldata %>%
dplyr::select(-"row_ID") %>%
pivot_wider(names_from = mz_rt,
values_from = peak_height)
# list of reduced features
clusternames <- cluster_features$mz_rt
# dplyr::select only the features are in the reduced list
imp_clust <- imputed_fulldata_wide[,c(names(imputed_fulldata_wide) %in% clusternames)]
# bind back sample names
imp_clust <- cbind(imputed_fulldata_wide[1], imp_clust)
tibble(imp_clust)## # A tibble: 56 × 1,082
## sample_ID `553.3887_0.421` `666.637_0.431` `489.2284_0.496` `696.649_0.442`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6104_U2_HI… 3029. 20290. 1683. 4876.
## 2 6110_U2_HI… 4240. 502. 2050. 502.
## 3 6108_U2_HI… 2162. 505. 505. 1630.
## 4 6111_U4_HI… 8723. 1187. 1205. 1213.
## 5 6101_U3_HI… 3652. 1153. 500. 1006.
## 6 6104_U1_HI… 6700. 1479. 1635. 13722.
## 7 6103_U4_HI… 7694. 3270. 501. 501.
## 8 6110_U3_HI… 1657. 500. 1444. 1173.
## 9 6111_U2_HI… 7980. 1055. 1206. 1201.
## 10 6111_U1_HI… 4491. 1629. 2147. 1494.
## # ℹ 46 more rows
## # ℹ 1,077 more variables: `297.1938_0.496` <dbl>, `356.2792_0.463` <dbl>,
## # `349.1768_0.501` <dbl>, `376.3184_0.456` <dbl>, `572.3994_0.467` <dbl>,
## # `188.0705_0.471` <dbl>, `342.2636_0.484` <dbl>, `357.1666_0.506` <dbl>,
## # `313.1894_0.512` <dbl>, `324.2159_0.482` <dbl>, `472.3114_0.479` <dbl>,
## # `169.0495_0.477` <dbl>, `328.248_0.503` <dbl>, `540.4236_0.454` <dbl>,
## # `269.1382_0.482` <dbl>, `339.1871_0.5` <dbl>, `316.2478_0.506` <dbl>, …
# plot new rt vs mz scatterplot post-clustering
(plot_mzvsrt_postcluster <- cluster_features %>%
ggplot(aes(x = rt,
y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "Retention time, min",
y = "m/z, neutral",
title = "mz across RT for all features after clustering"))# plot both scatterplots to compare with and without notame clustering
(scatterplots <- ggarrange(plot_mzvsrt,
plot_mzvsrt_postcluster,
nrow = 2))# change meta data columns to character so that I can change NAs from QCs to "QC"
imp_metabind_clust <- imp_metabind_clust %>%
mutate_at(c("Subject",
"Period",
"Intervention",
"pre_post",
"sequence",
"Intervention_week",
"Sex",
"Age",
"BMI"),
as.character)
# replace NAs in metadata columns for QCs
imp_metabind_clust[is.na(imp_metabind_clust)] <- "QC"
# unite pre_post column with intervention column to create pre_intervention column
imp_metabind_clust <- imp_metabind_clust %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)
# long df
imp_metabind_clust_tidy <- imp_metabind_clust %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund")
# structure check
str(imp_metabind_clust_tidy)## tibble [60,536 × 13] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:60536] "6101_U1_HILICPOS_59" "6101_U1_HILICPOS_59" "6101_U1_HILICPOS_59" "6101_U1_HILICPOS_59" ...
## $ Subject : chr [1:60536] "6101" "6101" "6101" "6101" ...
## $ Period : chr [1:60536] "U1" "U1" "U1" "U1" ...
## $ pre_post_intervention: chr [1:60536] "pre_Red" "pre_Red" "pre_Red" "pre_Red" ...
## $ Intervention : chr [1:60536] "Red" "Red" "Red" "Red" ...
## $ pre_post : chr [1:60536] "pre" "pre" "pre" "pre" ...
## $ sequence : chr [1:60536] "R_Y" "R_Y" "R_Y" "R_Y" ...
## $ Intervention_week : chr [1:60536] "2" "2" "2" "2" ...
## $ Sex : chr [1:60536] "F" "F" "F" "F" ...
## $ Age : chr [1:60536] "58" "58" "58" "58" ...
## $ BMI : chr [1:60536] "31.0556029483653" "31.0556029483653" "31.0556029483653" "31.0556029483653" ...
## $ mz_rt : chr [1:60536] "553.3887_0.421" "666.637_0.431" "489.2284_0.496" "696.649_0.442" ...
## $ rel_abund : num [1:60536] 4290 5838 503 45615 2919 ...
imp_metabind_clust_tidy %>%
ggplot(aes(x = sample_ID, y = rel_abund, color = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_color_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Unscaled data",
y = "Relative abundance")
Will need to log transform in order to normalize and actually see the
data
imp_metabind_clust_tidy_log2 <- imp_metabind_clust_tidy %>%
mutate(rel_abund_log2 = if_else(rel_abund > 0, log2(rel_abund), 0)) %>%
replace(is.na(.), 0)(bp_data_quality <- imp_metabind_clust_tidy_log2 %>%
ggplot(aes(x = sample_ID, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_fill_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Log2 transformed data",
y = "Relative abundance"))Much better! QCs look like there may be some drift though. Let’s do drift correction with the Notame package!
# filtered and imputed data after notame clustering, transposed
features_testforQCcorr <- t(imp_clust) %>%
as.data.frame() %>%
row_to_names(row_number = "find_header")
# log2 transform
log2_features_testforQCcorr <- features_testforQCcorr %>%
mutate_all(as.numeric) %>%
log2()# write csv to manually edit
write.csv(log2_features_testforQCcorr,
"notame dfs/feaures_test.csv",
row.names = TRUE)Import corrected df (edited so that mz_rt could rowname 1)
# separate sampleID and injection order
pheno_data <- imp_clust[1] %>%
separate(col = sample_ID,
into = c("sample_ID", "injection_order"),
sep = "_HILICPOS_")
# change pre-run QC injection number to correct number
pheno_data[48, "injection_order"] <- 9
# make inj order column num
pheno_data <- pheno_data %>%
mutate_at("injection_order", as.numeric)
t_pheno_data <- as.data.frame(t(pheno_data))Combine pheno and feature dfs manually in excel to create metaboset df.
#make sure when converting csv to xlsx that you save as a new file, don't just change the name of the file
metaboset <- read_from_excel("notame dfs/metaboset.xlsx",
split_by = c("column", "Ion mode"))## INFO [2024-05-22 16:49:14] Detecting corner position
## INFO [2024-05-22 16:49:14] Corner detected correctly at row 3, column D
## INFO [2024-05-22 16:49:14]
## Extracting sample information from rows 1 to 3 and columns E to BH
## INFO [2024-05-22 16:49:14] Replacing spaces in sample information column names with underscores (_)
## INFO [2024-05-22 16:49:14] Naming the last column of sample information "Datafile"
## INFO [2024-05-22 16:49:14]
## Extracting feature information from rows 4 to 1084 and columns A to D
## INFO [2024-05-22 16:49:14] Creating Split column from column, Ion mode
## INFO [2024-05-22 16:49:14] Feature_ID column not found, creating feature IDs
## INFO [2024-05-22 16:49:14] Identified m/z column mass and retention time column rt
## INFO [2024-05-22 16:49:14] Creating feature IDs from Split, m/z and retention time
## INFO [2024-05-22 16:49:14] Replacing dots (.) in feature information column names with underscores (_)
## INFO [2024-05-22 16:49:14]
## Extracting feature abundances from rows 4 to 1084 and columns E to BH
## INFO [2024-05-22 16:49:14]
## Checking sample information
## INFO [2024-05-22 16:49:14] QC column generated from rows containing 'QC'
## INFO [2024-05-22 16:49:14] Sample ID autogenerated from injection orders and prefix ID_
## INFO [2024-05-22 16:49:14] Checking that feature abundances only contain numeric values
## INFO [2024-05-22 16:49:14]
## Checking feature information
## INFO [2024-05-22 16:49:14] Checking that feature IDs are unique and not stored as numbers
## INFO [2024-05-22 16:49:14] Checking that m/z and retention time values are reasonable
#construct Metaboset
modes <- construct_metabosets(exprs = metaboset$exprs,
pheno_data = metaboset$pheno_data,
feature_data = metaboset$feature_data, group_col = "Class")## Initializing the object(s) with unflagged features
## INFO [2024-05-22 16:49:14]
## Checking feature information
## INFO [2024-05-22 16:49:14] Checking that feature IDs are unique and not stored as numbers
## INFO [2024-05-22 16:49:14] Checking that feature abundances only contain numeric values
## INFO [2024-05-22 16:49:14] Setting row and column names of exprs based on feature and pheno data
(qualityBPs_b4correction <- plot_sample_boxplots(mode, order_by = "Class", title = "Uncorrected feature abundance"))drift corrected takes up to 2 minutes
## INFO [2024-05-22 16:49:18]
## 0% of features flagged for low detection rate
## INFO [2024-05-22 16:49:18]
## Starting drift correction at 2024-05-22 16:49:18.962368
## INFO [2024-05-22 16:49:23] Drift correction performed at 2024-05-22 16:49:23.533239
## INFO [2024-05-22 16:49:25] Inspecting drift correction results 2024-05-22 16:49:25.29982
## INFO [2024-05-22 16:49:28] Drift correction results inspected at 2024-05-22 16:49:28.312032
## INFO [2024-05-22 16:49:28]
## Drift correction results inspected, report:
## Drift_corrected: 100%
inspection also takes about 2 minutes to run; output is percent of the features that were drift corrected. The remaining “low-quality” percent represents features for which the DC did not improve the RSD and D-ratio of the original data.
## INFO [2024-05-22 16:49:29] Inspecting drift correction results 2024-05-22 16:49:29.821425
## INFO [2024-05-22 16:49:39] Drift correction results inspected at 2024-05-22 16:49:39.792866
## INFO [2024-05-22 16:49:39]
## Drift correction results inspected, report:
## Drift_corrected: 74%, Low_quality: 26%
(qualityBPs_compared <- ggarrange(qualityBPs_b4correction, qualityBPS_driftcorrection,
ncol = 1, nrow = 2))metabdata_corrected_MZ_RT <- metabdata_corrected %>%
mutate(mass = round(metabdata_corrected$mass, digits = 4), # Decrease number of decimals for m/z & rt
rt = round(metabdata_corrected$rt, digits = 3),
.before=1,
.keep="unused") %>%
unite(mz_rt, c(mass, rt), remove=TRUE) # Combine m/z & rt with _ in betweenI want the new drift corrected (DC) df to look just like “imp_metabind_clust_log2” df
# go back to wide data
imp_metabind_clust_log2 <- imp_metabind_clust_tidy_log2 %>%
dplyr::select(!rel_abund) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2)# combine subject and period columns from imp_metabind_clust_log2 in order to mimic DC df
subj_per_imp_metabind_clust_log2 <- imp_metabind_clust_log2 %>%
unite(subj_period, c(Subject, Period), remove = FALSE)
# place new DC observations in
DC_imp_metabind_clust_log2 <- full_join(subj_per_imp_metabind_clust_log2[,c(1:12)], metabdata_corrected_t, by = "subj_period")
# take out old QC observations
DC_imp_metabind_clust_log2 <- DC_imp_metabind_clust_log2 %>%
filter(subj_period != "QC_QC")
# replace NAs in columns for QCs
DC_imp_metabind_clust_log2[is.na(DC_imp_metabind_clust_log2)] <- "QC"Data after drift correction
DC_imp_metabind_clust_log2 <- DC_imp_metabind_clust_log2 %>%
dplyr::select(!"subj_period") %>%
mutate_at(-c(1:11), as.numeric)PCA.DC_imp_metabind_clust_log2 <- PCA(DC_imp_metabind_clust_log2, # wide data
quali.sup = 1:11, # remove qualitative variables
graph = FALSE, # don't graph
scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
kable(summary(PCA.DC_imp_metabind_clust_log2))##
## Call:
## PCA(X = DC_imp_metabind_clust_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 740.319 277.768 170.943 111.555 103.715 76.395 71.350
## % of var. 34.724 13.028 8.018 5.232 4.865 3.583 3.347
## Cumulative % of var. 34.724 47.752 55.770 61.002 65.867 69.450 72.797
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 54.362 46.711 43.259 42.936 34.019 32.825 24.354
## % of var. 2.550 2.191 2.029 2.014 1.596 1.540 1.142
## Cumulative % of var. 75.347 77.537 79.566 81.580 83.176 84.716 85.858
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 23.286 19.938 19.341 17.074 15.831 13.860 13.390
## % of var. 1.092 0.935 0.907 0.801 0.743 0.650 0.628
## Cumulative % of var. 86.950 87.885 88.792 89.593 90.336 90.986 91.614
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 13.061 12.009 11.811 10.464 9.839 8.905 8.619
## % of var. 0.613 0.563 0.554 0.491 0.461 0.418 0.404
## Cumulative % of var. 92.226 92.790 93.344 93.834 94.296 94.714 95.118
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 7.911 7.585 7.007 6.857 6.548 5.948 5.696
## % of var. 0.371 0.356 0.329 0.322 0.307 0.279 0.267
## Cumulative % of var. 95.489 95.845 96.173 96.495 96.802 97.081 97.348
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 5.555 5.404 5.080 4.891 4.629 4.614 4.341
## % of var. 0.261 0.253 0.238 0.229 0.217 0.216 0.204
## Cumulative % of var. 97.609 97.862 98.101 98.330 98.547 98.764 98.967
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48 Dim.49
## Variance 3.933 3.790 3.463 3.328 2.998 2.653 0.425
## % of var. 0.184 0.178 0.162 0.156 0.141 0.124 0.020
## Cumulative % of var. 99.152 99.329 99.492 99.648 99.789 99.913 99.933
## Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55
## Variance 0.407 0.341 0.320 0.259 0.103 0.000
## % of var. 0.019 0.016 0.015 0.012 0.005 0.000
## Cumulative % of var. 99.952 99.968 99.983 99.995 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2
## 1 | 36.227 | -21.292 1.094 0.345 | -4.578
## 2 | 32.026 | -19.203 0.890 0.360 | -4.873
## 3 | 41.195 | -11.622 0.326 0.080 | -7.221
## 4 | 36.311 | -21.504 1.115 0.351 | -3.138
## 5 | 37.947 | -19.379 0.906 0.261 | -2.619
## 6 | 72.463 | 56.513 7.703 0.608 | -32.698
## 7 | 42.209 | -17.833 0.767 0.179 | -2.702
## 8 | 37.900 | -12.342 0.367 0.106 | -7.527
## 9 | 33.967 | -13.714 0.454 0.163 | -7.331
## 10 | 38.441 | -12.606 0.383 0.108 | -0.274
## ctr cos2 Dim.3 ctr cos2
## 1 0.135 0.016 | 4.134 0.179 0.013 |
## 2 0.153 0.023 | -2.010 0.042 0.004 |
## 3 0.335 0.031 | -1.279 0.017 0.001 |
## 4 0.063 0.007 | 2.512 0.066 0.005 |
## 5 0.044 0.005 | -1.708 0.030 0.002 |
## 6 6.873 0.204 | -10.672 1.190 0.022 |
## 7 0.047 0.004 | -0.190 0.000 0.000 |
## 8 0.364 0.039 | -4.555 0.217 0.014 |
## 9 0.346 0.047 | 5.926 0.367 0.030 |
## 10 0.000 0.000 | -5.964 0.372 0.024 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2
## 553.3887_0.421 | 0.274 0.010 0.046 | 0.483 0.084 0.144 |
## 666.637_0.431 | 0.199 0.005 0.044 | 0.070 0.002 0.005 |
## 489.2284_0.496 | 1.344 0.244 0.321 | 1.525 0.837 0.413 |
## 696.649_0.442 | 0.096 0.001 0.005 | 0.274 0.027 0.039 |
## 297.1938_0.496 | 1.715 0.397 0.886 | 0.003 0.000 0.000 |
## 356.2792_0.463 | -0.107 0.002 0.008 | 0.408 0.060 0.119 |
## 349.1768_0.501 | 0.080 0.001 0.006 | -0.051 0.001 0.003 |
## 376.3184_0.456 | -0.342 0.016 0.056 | 0.321 0.037 0.050 |
## 572.3994_0.467 | -0.137 0.003 0.029 | 0.076 0.002 0.009 |
## 188.0705_0.471 | -0.056 0.000 0.002 | 0.418 0.063 0.102 |
## Dim.3 ctr cos2
## 553.3887_0.421 -0.379 0.084 0.089 |
## 666.637_0.431 -0.075 0.003 0.006 |
## 489.2284_0.496 0.694 0.281 0.085 |
## 696.649_0.442 -0.366 0.078 0.069 |
## 297.1938_0.496 0.301 0.053 0.027 |
## 356.2792_0.463 0.115 0.008 0.010 |
## 349.1768_0.501 0.095 0.005 0.009 |
## 376.3184_0.456 -0.121 0.009 0.007 |
## 572.3994_0.467 0.229 0.031 0.081 |
## 188.0705_0.471 -0.034 0.001 0.001 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2
## sample_ID_6101_U1_HILICPOS_59 | 36.227 | -21.292 0.345 -0.783 | -4.578
## sample_ID_6101_U2_HILICPOS_30 | 37.795 | -16.899 0.200 -0.621 | -4.135
## sample_ID_6101_U3_HILICPOS_29 | 33.252 | -19.751 0.353 -0.726 | -8.982
## sample_ID_6101_U4_HILICPOS_14 | 35.958 | -20.050 0.311 -0.737 | -4.743
## sample_ID_6102_U1_HILICPOS_26 | 32.026 | -19.203 0.360 -0.706 | -4.873
## sample_ID_6102_U2_HILICPOS_16 | 34.164 | -17.005 0.248 -0.625 | -6.209
## sample_ID_6102_U3_HILICPOS_48 | 40.272 | -16.865 0.175 -0.620 | -1.346
## sample_ID_6102_U4_HILICPOS_50 | 33.636 | -17.520 0.271 -0.644 | -3.408
## sample_ID_6103_U1_HILICPOS_21 | 41.195 | -11.622 0.080 -0.427 | -7.221
## sample_ID_6103_U2_HILICPOS_60 | 38.193 | -13.993 0.134 -0.514 | -5.251
## cos2 v.test Dim.3 cos2 v.test
## sample_ID_6101_U1_HILICPOS_59 0.016 -0.275 | 4.134 0.013 0.316 |
## sample_ID_6101_U2_HILICPOS_30 0.012 -0.248 | 2.666 0.005 0.204 |
## sample_ID_6101_U3_HILICPOS_29 0.073 -0.539 | 3.135 0.009 0.240 |
## sample_ID_6101_U4_HILICPOS_14 0.017 -0.285 | -1.171 0.001 -0.090 |
## sample_ID_6102_U1_HILICPOS_26 0.023 -0.292 | -2.010 0.004 -0.154 |
## sample_ID_6102_U2_HILICPOS_16 0.033 -0.373 | -1.406 0.002 -0.108 |
## sample_ID_6102_U3_HILICPOS_48 0.001 -0.081 | 1.705 0.002 0.130 |
## sample_ID_6102_U4_HILICPOS_50 0.010 -0.204 | -4.219 0.016 -0.323 |
## sample_ID_6103_U1_HILICPOS_21 0.031 -0.433 | -1.279 0.001 -0.098 |
## sample_ID_6103_U2_HILICPOS_60 0.019 -0.315 | 2.224 0.003 0.170 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sample_ID_6101_U1_HILICPOS_59 | | | 36.227 | | | -21.292 | 0.345 | -0.783 | | | -4.578 | 0.016 | -0.275 | | | 4.134 | 0.013 | 0.316 | | |
| sample_ID_6101_U2_HILICPOS_30 | | | 37.795 | | | -16.899 | 0.200 | -0.621 | | | -4.135 | 0.012 | -0.248 | | | 2.666 | 0.005 | 0.204 | | |
| sample_ID_6101_U3_HILICPOS_29 | | | 33.252 | | | -19.751 | 0.353 | -0.726 | | | -8.982 | 0.073 | -0.539 | | | 3.135 | 0.009 | 0.240 | | |
| sample_ID_6101_U4_HILICPOS_14 | | | 35.958 | | | -20.050 | 0.311 | -0.737 | | | -4.743 | 0.017 | -0.285 | | | -1.171 | 0.001 | -0.090 | | |
| sample_ID_6102_U1_HILICPOS_26 | | | 32.026 | | | -19.203 | 0.360 | -0.706 | | | -4.873 | 0.023 | -0.292 | | | -2.010 | 0.004 | -0.154 | | |
| sample_ID_6102_U2_HILICPOS_16 | | | 34.164 | | | -17.005 | 0.248 | -0.625 | | | -6.209 | 0.033 | -0.373 | | | -1.406 | 0.002 | -0.108 | | |
| sample_ID_6102_U3_HILICPOS_48 | | | 40.272 | | | -16.865 | 0.175 | -0.620 | | | -1.346 | 0.001 | -0.081 | | | 1.705 | 0.002 | 0.130 | | |
| sample_ID_6102_U4_HILICPOS_50 | | | 33.636 | | | -17.520 | 0.271 | -0.644 | | | -3.408 | 0.010 | -0.204 | | | -4.219 | 0.016 | -0.323 | | |
| sample_ID_6103_U1_HILICPOS_21 | | | 41.195 | | | -11.622 | 0.080 | -0.427 | | | -7.221 | 0.031 | -0.433 | | | -1.279 | 0.001 | -0.098 | | |
| sample_ID_6103_U2_HILICPOS_60 | | | 38.193 | | | -13.993 | 0.134 | -0.514 | | | -5.251 | 0.019 | -0.315 | | | 2.224 | 0.003 | 0.170 | | |
# pull PC coordinates into df
PC_coord_QC_log2 <- as.data.frame(PCA.DC_imp_metabind_clust_log2$ind$coord)
# bind back metadata from cols 1-10
PC_coord_QC_log2 <- bind_cols(DC_imp_metabind_clust_log2[,1:11], PC_coord_QC_log2)
# grab some variance explained
importance_QC <- PCA.DC_imp_metabind_clust_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_withQC <- round(importance_QC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_withQC <- round(importance_QC[2,2], 2)Using FactoExtra package
| eigenvalue | variance.percent | cumulative.variance.percent | |
|---|---|---|---|
| Dim.1 | 740.3189051 | 34.7237156 | 34.72372 |
| Dim.2 | 277.7684601 | 13.0283760 | 47.75209 |
| Dim.3 | 170.9426797 | 8.0178487 | 55.76994 |
| Dim.4 | 111.5554061 | 5.2323643 | 61.00230 |
| Dim.5 | 103.7152804 | 4.8646332 | 65.86694 |
| Dim.6 | 76.3952643 | 3.5832226 | 69.45016 |
| Dim.7 | 71.3504685 | 3.3466029 | 72.79676 |
| Dim.8 | 54.3616737 | 2.5497651 | 75.34653 |
| Dim.9 | 46.7113400 | 2.1909359 | 77.53746 |
| Dim.10 | 43.2590916 | 2.0290126 | 79.56648 |
| Dim.11 | 42.9356262 | 2.0138409 | 81.58032 |
| Dim.12 | 34.0186296 | 1.5956005 | 83.17592 |
| Dim.13 | 32.8249413 | 1.5396121 | 84.71553 |
| Dim.14 | 24.3537740 | 1.1422828 | 85.85781 |
| Dim.15 | 23.2861743 | 1.0922084 | 86.95002 |
| Dim.16 | 19.9375981 | 0.9351477 | 87.88517 |
| Dim.17 | 19.3408314 | 0.9071571 | 88.79233 |
| Dim.18 | 17.0738063 | 0.8008251 | 89.59315 |
| Dim.19 | 15.8305812 | 0.7425133 | 90.33566 |
| Dim.20 | 13.8599354 | 0.6500826 | 90.98575 |
| Dim.21 | 13.3903408 | 0.6280569 | 91.61380 |
| Dim.22 | 13.0609707 | 0.6126082 | 92.22641 |
| Dim.23 | 12.0090727 | 0.5632703 | 92.78968 |
| Dim.24 | 11.8106898 | 0.5539654 | 93.34365 |
| Dim.25 | 10.4636200 | 0.4907828 | 93.83443 |
| Dim.26 | 9.8390777 | 0.4614894 | 94.29592 |
| Dim.27 | 8.9053376 | 0.4176935 | 94.71361 |
| Dim.28 | 8.6193822 | 0.4042811 | 95.11789 |
| Dim.29 | 7.9110564 | 0.3710580 | 95.48895 |
| Dim.30 | 7.5854166 | 0.3557843 | 95.84474 |
| Dim.31 | 7.0065218 | 0.3286320 | 96.17337 |
| Dim.32 | 6.8569526 | 0.3216166 | 96.49499 |
| Dim.33 | 6.5478600 | 0.3071190 | 96.80210 |
| Dim.34 | 5.9481472 | 0.2789903 | 97.08110 |
| Dim.35 | 5.6956475 | 0.2671471 | 97.34824 |
| Dim.36 | 5.5550901 | 0.2605544 | 97.60880 |
| Dim.37 | 5.4044490 | 0.2534888 | 97.86229 |
| Dim.38 | 5.0799936 | 0.2382706 | 98.10056 |
| Dim.39 | 4.8911355 | 0.2294125 | 98.32997 |
| Dim.40 | 4.6293744 | 0.2171349 | 98.54710 |
| Dim.41 | 4.6142178 | 0.2164240 | 98.76353 |
| Dim.42 | 4.3408372 | 0.2036014 | 98.96713 |
| Dim.43 | 3.9332019 | 0.1844818 | 99.15161 |
| Dim.44 | 3.7895494 | 0.1777440 | 99.32935 |
| Dim.45 | 3.4627969 | 0.1624181 | 99.49177 |
| Dim.46 | 3.3282705 | 0.1561083 | 99.64788 |
| Dim.47 | 2.9981728 | 0.1406255 | 99.78851 |
| Dim.48 | 2.6529236 | 0.1244320 | 99.91294 |
| Dim.49 | 0.4254977 | 0.0199574 | 99.93290 |
| Dim.50 | 0.4073314 | 0.0191054 | 99.95200 |
| Dim.51 | 0.3413914 | 0.0160125 | 99.96801 |
| Dim.52 | 0.3201818 | 0.0150177 | 99.98303 |
| Dim.53 | 0.2590048 | 0.0121483 | 99.99518 |
| Dim.54 | 0.1027576 | 0.0048197 | 100.00000 |
| Dim.55 | 0.0000092 | 0.0000004 | 100.00000 |
# manual scores plot
(PCA_withQCs <- PC_coord_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_withQC/PC1_withQC) +
labs(x = glue::glue("PC1: {PC1_withQC}%"),
y = glue::glue("PC2: {PC2_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data"))DC_imp_metabind_clust_log2_noQCs <- DC_imp_metabind_clust_log2 %>%
filter(Intervention != "QC")
PCA.DC_imp_metabind_clust_log2_noQCs <- PCA(DC_imp_metabind_clust_log2_noQCs, # wide data
quali.sup=1:11, # remove qualitative variables
graph=FALSE, # don't graph
scale.unit=FALSE) # don't scale, we already did this
# look at summary
kable(summary(PCA.DC_imp_metabind_clust_log2_noQCs))##
## Call:
## PCA(X = DC_imp_metabind_clust_log2_noQCs, scale.unit = FALSE,
## quali.sup = 1:11, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 585.594 249.121 131.532 129.672 89.127 84.013 65.178
## % of var. 29.946 12.739 6.726 6.631 4.558 4.296 3.333
## Cumulative % of var. 29.946 42.685 49.412 56.043 60.600 64.897 68.230
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 54.491 50.986 50.334 42.612 38.301 28.482 27.486
## % of var. 2.787 2.607 2.574 2.179 1.959 1.456 1.406
## Cumulative % of var. 71.016 73.623 76.197 78.377 80.335 81.792 83.197
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 24.389 22.813 21.557 19.017 16.417 15.714 15.582
## % of var. 1.247 1.167 1.102 0.972 0.840 0.804 0.797
## Cumulative % of var. 84.444 85.611 86.713 87.686 88.525 89.329 90.126
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 14.288 13.802 12.354 11.492 10.685 10.090 9.402
## % of var. 0.731 0.706 0.632 0.588 0.546 0.516 0.481
## Cumulative % of var. 90.857 91.562 92.194 92.782 93.328 93.844 94.325
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 9.011 8.173 8.044 7.654 6.944 6.716 6.637
## % of var. 0.461 0.418 0.411 0.391 0.355 0.343 0.339
## Cumulative % of var. 94.786 95.204 95.615 96.006 96.361 96.705 97.044
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 6.400 5.937 5.704 5.658 5.398 5.112 4.629
## % of var. 0.327 0.304 0.292 0.289 0.276 0.261 0.237
## Cumulative % of var. 97.372 97.675 97.967 98.256 98.532 98.794 99.030
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 4.424 4.038 3.882 3.514 3.103
## % of var. 0.226 0.206 0.199 0.180 0.159
## Cumulative % of var. 99.257 99.463 99.662 99.841 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 1 | 32.524 | -13.061 0.607 0.161 | -8.055 0.543
## 2 | 27.856 | -10.027 0.358 0.130 | -3.802 0.121
## 3 | 39.443 | -2.753 0.027 0.005 | -5.020 0.211
## 4 | 32.636 | -13.645 0.662 0.175 | -6.195 0.321
## 5 | 34.700 | -11.193 0.446 0.104 | -2.435 0.050
## 6 | 76.822 | 70.444 17.654 0.841 | -3.040 0.077
## 7 | 39.426 | -9.359 0.312 0.056 | -2.057 0.035
## 8 | 35.511 | -2.385 0.020 0.005 | -2.629 0.058
## 9 | 31.664 | -5.558 0.110 0.031 | -9.805 0.804
## 10 | 36.701 | -5.651 0.114 0.024 | 2.991 0.075
## cos2 Dim.3 ctr cos2
## 1 0.061 | -7.037 0.784 0.047 |
## 2 0.019 | -8.158 1.054 0.086 |
## 3 0.016 | 0.528 0.004 0.000 |
## 4 0.036 | -9.149 1.326 0.079 |
## 5 0.005 | -7.381 0.863 0.045 |
## 6 0.002 | -8.994 1.281 0.014 |
## 7 0.003 | -3.208 0.163 0.007 |
## 8 0.005 | -0.141 0.000 0.000 |
## 9 0.096 | 2.393 0.091 0.006 |
## 10 0.007 | -8.613 1.175 0.055 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 553.3887_0.421 | 0.004 0.000 0.000 | 0.626 0.157 0.224 | 0.052
## 666.637_0.431 | 0.103 0.002 0.011 | 0.090 0.003 0.008 | -0.340
## 489.2284_0.496 | -0.231 0.009 0.040 | 0.496 0.099 0.184 | 0.174
## 696.649_0.442 | 0.047 0.000 0.001 | 0.504 0.102 0.112 | -0.153
## 297.1938_0.496 | 1.185 0.240 0.836 | -0.109 0.005 0.007 | -0.038
## 356.2792_0.463 | -0.440 0.033 0.126 | 0.096 0.004 0.006 | -0.098
## 349.1768_0.501 | 0.028 0.000 0.001 | -0.142 0.008 0.018 | -0.057
## 376.3184_0.456 | -0.463 0.037 0.089 | 0.234 0.022 0.023 | -0.373
## 572.3994_0.467 | -0.236 0.010 0.074 | -0.158 0.010 0.033 | 0.100
## 188.0705_0.471 | -0.317 0.017 0.052 | 0.267 0.029 0.037 | -0.101
## ctr cos2
## 553.3887_0.421 0.002 0.002 |
## 666.637_0.431 0.088 0.115 |
## 489.2284_0.496 0.023 0.023 |
## 696.649_0.442 0.018 0.010 |
## 297.1938_0.496 0.001 0.001 |
## 356.2792_0.463 0.007 0.006 |
## 349.1768_0.501 0.002 0.003 |
## 376.3184_0.456 0.106 0.058 |
## 572.3994_0.467 0.008 0.013 |
## 188.0705_0.471 0.008 0.005 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2
## 6101_U1_HILICPOS_59 | 32.524 | -13.061 0.161 -0.540 | -8.055 0.061
## 6101_U2_HILICPOS_30 | 35.207 | -9.193 0.068 -0.380 | -6.015 0.029
## 6101_U3_HILICPOS_29 | 28.936 | -9.532 0.109 -0.394 | -9.966 0.119
## 6101_U4_HILICPOS_14 | 32.217 | -11.100 0.119 -0.459 | -4.683 0.021
## 6102_U1_HILICPOS_26 | 27.856 | -10.027 0.130 -0.414 | -3.802 0.019
## 6102_U2_HILICPOS_16 | 30.721 | -7.607 0.061 -0.314 | -4.640 0.023
## 6102_U3_HILICPOS_48 | 38.090 | -10.278 0.073 -0.425 | -3.604 0.009
## 6102_U4_HILICPOS_50 | 30.369 | -9.303 0.094 -0.384 | -1.919 0.004
## 6103_U1_HILICPOS_21 | 39.443 | -2.753 0.005 -0.114 | -5.020 0.016
## 6103_U2_HILICPOS_60 | 36.118 | -6.110 0.029 -0.253 | -6.046 0.028
## v.test Dim.3 cos2 v.test
## 6101_U1_HILICPOS_59 -0.510 | -7.037 0.047 -0.614 |
## 6101_U2_HILICPOS_30 -0.381 | -4.210 0.014 -0.367 |
## 6101_U3_HILICPOS_29 -0.631 | -1.908 0.004 -0.166 |
## 6101_U4_HILICPOS_14 -0.297 | -11.678 0.131 -1.018 |
## 6102_U1_HILICPOS_26 -0.241 | -8.158 0.086 -0.711 |
## 6102_U2_HILICPOS_16 -0.294 | -2.585 0.007 -0.225 |
## 6102_U3_HILICPOS_48 -0.228 | -7.626 0.040 -0.665 |
## 6102_U4_HILICPOS_50 -0.122 | -16.349 0.290 -1.426 |
## 6103_U1_HILICPOS_21 -0.318 | 0.528 0.000 0.046 |
## 6103_U2_HILICPOS_60 -0.383 | 5.178 0.021 0.451 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_HILICPOS_59 | | | 32.524 | | | -13.061 | 0.161 | -0.540 | | | -8.055 | 0.061 | -0.510 | | | -7.037 | 0.047 | -0.614 | | |
| 6101_U2_HILICPOS_30 | | | 35.207 | | | -9.193 | 0.068 | -0.380 | | | -6.015 | 0.029 | -0.381 | | | -4.210 | 0.014 | -0.367 | | |
| 6101_U3_HILICPOS_29 | | | 28.936 | | | -9.532 | 0.109 | -0.394 | | | -9.966 | 0.119 | -0.631 | | | -1.908 | 0.004 | -0.166 | | |
| 6101_U4_HILICPOS_14 | | | 32.217 | | | -11.100 | 0.119 | -0.459 | | | -4.683 | 0.021 | -0.297 | | | -11.678 | 0.131 | -1.018 | | |
| 6102_U1_HILICPOS_26 | | | 27.856 | | | -10.027 | 0.130 | -0.414 | | | -3.802 | 0.019 | -0.241 | | | -8.158 | 0.086 | -0.711 | | |
| 6102_U2_HILICPOS_16 | | | 30.721 | | | -7.607 | 0.061 | -0.314 | | | -4.640 | 0.023 | -0.294 | | | -2.585 | 0.007 | -0.225 | | |
| 6102_U3_HILICPOS_48 | | | 38.090 | | | -10.278 | 0.073 | -0.425 | | | -3.604 | 0.009 | -0.228 | | | -7.626 | 0.040 | -0.665 | | |
| 6102_U4_HILICPOS_50 | | | 30.369 | | | -9.303 | 0.094 | -0.384 | | | -1.919 | 0.004 | -0.122 | | | -16.349 | 0.290 | -1.426 | | |
| 6103_U1_HILICPOS_21 | | | 39.443 | | | -2.753 | 0.005 | -0.114 | | | -5.020 | 0.016 | -0.318 | | | 0.528 | 0.000 | 0.046 | | |
| 6103_U2_HILICPOS_60 | | | 36.118 | | | -6.110 | 0.029 | -0.253 | | | -6.046 | 0.028 | -0.383 | | | 5.178 | 0.021 | 0.451 | | |
# pull PC coordinates into df
PC_coord_noQCs_log2 <- as.data.frame(PCA.DC_imp_metabind_clust_log2_noQCs$ind$coord)
# bind back metadata from cols 1-10
PC_coord_noQCs_log2 <- bind_cols(DC_imp_metabind_clust_log2_noQCs[,1:11], PC_coord_noQCs_log2)
# grab some variance explained
importance_noQC <- PCA.DC_imp_metabind_clust_log2_noQCs$eig
# set variance explained with PC1, round to 2 digits
PC1_noQC <- round(importance_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_noQC <- round(importance_noQC[2,2], 2)Using FactoExtra
# plot of contributions from features to PC1
(var_contrib_noQCs_PC1 <- fviz_contrib(PCA.DC_imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 1,
top = 25,
title = "Var contribution to PC1: no QCs"))# plot of contributions from features to PC2
(var_contrib_noQCs_PC2 <- fviz_contrib(PCA.DC_imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 2,
top = 25,
title = "Var contribution to PC2: no QCs"))(PCA_withoutQCs <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gold", "tomato1")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, without QCs"))(PCA_withoutQCs.pre_post <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed, without QCs"))Looks like subject 6106 and 6112 are outliers
# go back to wide data
DC_imp_nooutliers_log2 <- DC_imp_metabind_clust_log2 %>%
filter(Subject != 6106,
Subject != 6112)
PCA.DC_imp_nooutliers_log2 <- PCA(DC_imp_nooutliers_log2, # wide data
quali.sup = 1:11, # remove qualitative variables
graph = FALSE, # don't graph
scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
summary(PCA.DC_imp_nooutliers_log2)##
## Call:
## PCA(X = DC_imp_nooutliers_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 627.901 132.609 130.401 87.427 84.085 63.521 54.604
## % of var. 37.554 7.931 7.799 5.229 5.029 3.799 3.266
## Cumulative % of var. 37.554 45.485 53.284 58.513 63.542 67.341 70.607
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 49.255 46.791 40.151 36.198 27.295 26.088 22.043
## % of var. 2.946 2.799 2.401 2.165 1.633 1.560 1.318
## Cumulative % of var. 73.553 76.352 78.753 80.918 82.550 84.111 85.429
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 20.917 19.266 16.256 15.705 14.491 13.294 13.016
## % of var. 1.251 1.152 0.972 0.939 0.867 0.795 0.778
## Cumulative % of var. 86.680 87.832 88.805 89.744 90.611 91.406 92.184
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 11.730 10.631 10.158 9.760 9.024 7.765 7.199
## % of var. 0.702 0.636 0.608 0.584 0.540 0.464 0.431
## Cumulative % of var. 92.886 93.522 94.129 94.713 95.252 95.717 96.147
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 6.943 6.633 6.121 5.994 5.659 5.457 4.898
## % of var. 0.415 0.397 0.366 0.358 0.338 0.326 0.293
## Cumulative % of var. 96.563 96.959 97.326 97.684 98.022 98.349 98.642
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 4.779 4.227 4.101 3.989 3.394 0.521 0.486
## % of var. 0.286 0.253 0.245 0.239 0.203 0.031 0.029
## Cumulative % of var. 98.928 99.180 99.426 99.664 99.867 99.898 99.927
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 0.404 0.379 0.306 0.123 0.000
## % of var. 0.024 0.023 0.018 0.007 0.000
## Cumulative % of var. 99.952 99.974 99.993 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2
## 1 | 33.465 | -17.093 0.969 0.261 | 6.208
## 2 | 29.904 | -16.159 0.866 0.292 | 7.362
## 3 | 40.231 | -9.040 0.271 0.050 | -10.689
## 4 | 33.619 | -16.923 0.950 0.253 | 7.666
## 5 | 36.076 | -15.402 0.787 0.182 | 7.812
## 6 | 40.824 | -14.446 0.692 0.125 | 11.070
## 7 | 37.101 | -10.584 0.372 0.081 | -3.948
## 8 | 31.819 | -10.238 0.348 0.104 | -7.422
## 9 | 37.589 | -8.657 0.249 0.053 | 9.439
## 10 | 33.269 | -11.561 0.443 0.121 | -2.054
## ctr cos2 Dim.3 ctr cos2
## 1 0.605 0.034 | -5.882 0.553 0.031 |
## 2 0.851 0.061 | -1.130 0.020 0.001 |
## 3 1.795 0.071 | 19.667 6.179 0.239 |
## 4 0.923 0.052 | -1.064 0.018 0.001 |
## 5 0.959 0.047 | -2.751 0.121 0.006 |
## 6 1.925 0.074 | -17.441 4.860 0.183 |
## 7 0.245 0.011 | 10.093 1.628 0.074 |
## 8 0.865 0.054 | 4.872 0.379 0.023 |
## 9 1.400 0.063 | 5.211 0.434 0.019 |
## 10 0.066 0.004 | 1.279 0.026 0.001 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2
## 553.3887_0.421 | 0.418 0.028 0.137 | -0.019 0.000 0.000 |
## 666.637_0.431 | 0.181 0.005 0.039 | 0.280 0.059 0.093 |
## 489.2284_0.496 | 2.285 0.832 0.895 | 0.144 0.016 0.004 |
## 696.649_0.442 | 0.049 0.000 0.002 | 0.194 0.028 0.025 |
## 297.1938_0.496 | 1.569 0.392 0.899 | 0.041 0.001 0.001 |
## 356.2792_0.463 | 0.260 0.011 0.053 | -0.091 0.006 0.006 |
## 349.1768_0.501 | 0.123 0.002 0.014 | -0.194 0.028 0.035 |
## 376.3184_0.456 | -0.147 0.003 0.011 | 0.473 0.168 0.114 |
## 572.3994_0.467 | 0.011 0.000 0.000 | -0.177 0.024 0.047 |
## 188.0705_0.471 | 0.219 0.008 0.027 | 0.040 0.001 0.001 |
## Dim.3 ctr cos2
## 553.3887_0.421 0.424 0.138 0.141 |
## 666.637_0.431 0.131 0.013 0.020 |
## 489.2284_0.496 0.015 0.000 0.000 |
## 696.649_0.442 0.012 0.000 0.000 |
## 297.1938_0.496 0.050 0.002 0.001 |
## 356.2792_0.463 0.576 0.255 0.260 |
## 349.1768_0.501 0.476 0.174 0.214 |
## 376.3184_0.456 0.030 0.001 0.000 |
## 572.3994_0.467 0.061 0.003 0.006 |
## 188.0705_0.471 0.386 0.114 0.084 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2
## sample_ID_6101_U1_HILICPOS_59 | 33.465 | -17.093 0.261 -0.682 | 6.208
## sample_ID_6101_U2_HILICPOS_30 | 35.788 | -12.844 0.129 -0.513 | 3.822
## sample_ID_6101_U3_HILICPOS_29 | 30.728 | -17.283 0.316 -0.690 | -1.690
## sample_ID_6101_U4_HILICPOS_14 | 33.786 | -16.688 0.244 -0.666 | 9.983
## sample_ID_6102_U1_HILICPOS_26 | 29.904 | -16.159 0.292 -0.645 | 7.362
## sample_ID_6102_U2_HILICPOS_16 | 32.408 | -14.302 0.195 -0.571 | 0.731
## sample_ID_6102_U3_HILICPOS_48 | 38.451 | -12.056 0.098 -0.481 | 7.878
## sample_ID_6102_U4_HILICPOS_50 | 31.614 | -13.930 0.194 -0.556 | 13.105
## sample_ID_6103_U1_HILICPOS_21 | 40.231 | -9.040 0.050 -0.361 | -10.689
## sample_ID_6103_U2_HILICPOS_60 | 36.613 | -10.077 0.076 -0.402 | -12.601
## cos2 v.test Dim.3 cos2 v.test
## sample_ID_6101_U1_HILICPOS_59 0.034 0.539 | -5.882 0.031 -0.515 |
## sample_ID_6101_U2_HILICPOS_30 0.011 0.332 | -2.178 0.004 -0.191 |
## sample_ID_6101_U3_HILICPOS_29 0.003 -0.147 | -2.126 0.005 -0.186 |
## sample_ID_6101_U4_HILICPOS_14 0.087 0.867 | 0.370 0.000 0.032 |
## sample_ID_6102_U1_HILICPOS_26 0.061 0.639 | -1.130 0.001 -0.099 |
## sample_ID_6102_U2_HILICPOS_16 0.001 0.063 | 1.974 0.004 0.173 |
## sample_ID_6102_U3_HILICPOS_48 0.042 0.684 | -2.671 0.005 -0.234 |
## sample_ID_6102_U4_HILICPOS_50 0.172 1.138 | 8.632 0.075 0.756 |
## sample_ID_6103_U1_HILICPOS_21 0.071 -0.928 | 19.667 0.239 1.722 |
## sample_ID_6103_U2_HILICPOS_60 0.118 -1.094 | 13.110 0.128 1.148 |
# pull PC coordinates into df
PC_nooutliers_QC_log2 <- as.data.frame(PCA.DC_imp_nooutliers_log2$ind$coord)
# bind back metadata from cols 1-11
PC_nooutliers_QC_log2 <- bind_cols(DC_imp_nooutliers_log2[,1:11], PC_nooutliers_QC_log2)
# grab some variance explained
importance_nooutliers_QC <- PCA.DC_imp_nooutliers_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_withQC <- round(importance_nooutliers_QC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_withQC <- round(importance_nooutliers_QC[2,2], 2)Using FactoExtra package
##### Red vs yellow
# manual scores plot
(PCA_nooutliers_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot"))(PCA_nooutliers_prepost_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot"))imp_nooutliers_noQCs_log2 <- DC_imp_nooutliers_log2 %>%
filter(Intervention != "QC")
PCA.imp_nooutliers_noQCs_log2 <- PCA(imp_nooutliers_noQCs_log2, # wide data
quali.sup=1:11, # remove qualitative variables
graph=FALSE, # don't graph
scale.unit=FALSE) # don't scale, we already did this
# look at summary
kable(summary(PCA.imp_nooutliers_noQCs_log2))##
## Call:
## PCA(X = imp_nooutliers_noQCs_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 162.991 156.695 108.060 100.946 76.717 66.115 60.723
## % of var. 12.878 12.380 8.538 7.976 6.061 5.224 4.798
## Cumulative % of var. 12.878 25.258 33.796 41.772 47.833 53.057 57.855
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 56.149 48.644 43.624 32.856 31.737 26.453 25.527
## % of var. 4.436 3.843 3.447 2.596 2.508 2.090 2.017
## Cumulative % of var. 62.291 66.135 69.581 72.177 74.685 76.775 78.792
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 24.429 19.506 18.859 18.098 16.052 15.765 14.078
## % of var. 1.930 1.541 1.490 1.430 1.268 1.246 1.112
## Cumulative % of var. 80.722 82.263 83.753 85.183 86.451 87.697 88.809
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 13.375 12.370 11.699 11.004 9.371 8.678 8.331
## % of var. 1.057 0.977 0.924 0.869 0.740 0.686 0.658
## Cumulative % of var. 89.866 90.843 91.768 92.637 93.377 94.063 94.721
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 8.001 7.452 7.301 6.802 6.686 5.874 5.755
## % of var. 0.632 0.589 0.577 0.537 0.528 0.464 0.455
## Cumulative % of var. 95.353 95.942 96.519 97.057 97.585 98.049 98.504
## Dim.36 Dim.37 Dim.38 Dim.39
## Variance 5.112 4.920 4.814 4.093
## % of var. 0.404 0.389 0.380 0.323
## Cumulative % of var. 98.908 99.296 99.677 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 1 | 29.639 | -6.966 0.744 0.055 | -6.785 0.735
## 2 | 25.899 | -8.549 1.121 0.109 | -2.510 0.101
## 3 | 39.188 | 6.069 0.565 0.024 | 21.551 7.410
## 4 | 29.851 | -9.218 1.303 0.095 | -2.455 0.096
## 5 | 33.064 | -8.390 1.080 0.064 | -4.228 0.285
## 6 | 38.386 | -7.741 0.919 0.041 | -19.359 5.980
## 7 | 35.448 | 2.048 0.064 0.003 | 10.635 1.804
## 8 | 30.089 | 5.832 0.522 0.038 | 6.456 0.665
## 9 | 36.682 | -9.735 1.454 0.070 | 2.854 0.130
## 10 | 31.246 | 0.829 0.011 0.001 | 1.844 0.054
## cos2 Dim.3 ctr cos2
## 1 0.052 | -13.669 4.322 0.213 |
## 2 0.009 | -5.842 0.790 0.051 |
## 3 0.302 | 0.949 0.021 0.001 |
## 4 0.007 | -14.586 4.922 0.239 |
## 5 0.016 | -3.618 0.303 0.012 |
## 6 0.254 | 2.101 0.102 0.003 |
## 7 0.090 | 17.020 6.702 0.231 |
## 8 0.046 | 3.421 0.271 0.013 |
## 9 0.006 | 4.527 0.474 0.015 |
## 10 0.003 | -6.698 1.038 0.046 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 553.3887_0.421 | -0.045 0.001 0.002 | 0.452 0.130 0.157 | 0.094
## 666.637_0.431 | -0.333 0.068 0.117 | 0.071 0.003 0.005 | -0.286
## 489.2284_0.496 | 0.042 0.001 0.003 | -0.048 0.001 0.003 | -0.056
## 696.649_0.442 | -0.228 0.032 0.029 | -0.032 0.001 0.001 | -0.222
## 297.1938_0.496 | 0.073 0.003 0.020 | 0.025 0.000 0.002 | -0.024
## 356.2792_0.463 | -0.001 0.000 0.000 | 0.631 0.254 0.272 | 0.172
## 349.1768_0.501 | 0.113 0.008 0.010 | 0.553 0.195 0.247 | 0.108
## 376.3184_0.456 | -0.533 0.174 0.121 | -0.081 0.004 0.003 | -0.095
## 572.3994_0.467 | 0.163 0.016 0.033 | 0.111 0.008 0.015 | -0.211
## 188.0705_0.471 | -0.086 0.005 0.004 | 0.394 0.099 0.075 | 0.288
## ctr cos2
## 553.3887_0.421 0.008 0.007 |
## 666.637_0.431 0.075 0.087 |
## 489.2284_0.496 0.003 0.005 |
## 696.649_0.442 0.046 0.028 |
## 297.1938_0.496 0.001 0.002 |
## 356.2792_0.463 0.027 0.020 |
## 349.1768_0.501 0.011 0.009 |
## 376.3184_0.456 0.008 0.004 |
## 572.3994_0.467 0.041 0.056 |
## 188.0705_0.471 0.077 0.040 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2
## 6101_U1_HILICPOS_59 | 29.639 | -6.966 0.055 -0.546 | -6.785 0.052
## 6101_U2_HILICPOS_30 | 33.571 | -4.271 0.016 -0.335 | -2.824 0.007
## 6101_U3_HILICPOS_29 | 26.312 | -0.158 0.000 -0.012 | -1.345 0.003
## 6101_U4_HILICPOS_14 | 30.127 | -11.461 0.145 -0.898 | -1.635 0.003
## 6102_U1_HILICPOS_26 | 25.899 | -8.549 0.109 -0.670 | -2.510 0.009
## 6102_U2_HILICPOS_16 | 29.349 | -2.240 0.006 -0.175 | 1.917 0.004
## 6102_U3_HILICPOS_48 | 36.703 | -8.146 0.049 -0.638 | -4.207 0.013
## 6102_U4_HILICPOS_50 | 28.717 | -15.247 0.282 -1.194 | 5.541 0.037
## 6103_U1_HILICPOS_21 | 39.188 | 6.069 0.024 0.475 | 21.551 0.302
## 6103_U2_HILICPOS_60 | 35.133 | 9.055 0.066 0.709 | 15.644 0.198
## v.test Dim.3 cos2 v.test
## 6101_U1_HILICPOS_59 -0.542 | -13.669 0.213 -1.315 |
## 6101_U2_HILICPOS_30 -0.226 | -4.982 0.022 -0.479 |
## 6101_U3_HILICPOS_29 -0.107 | -11.847 0.203 -1.140 |
## 6101_U4_HILICPOS_14 -0.131 | -8.091 0.072 -0.778 |
## 6102_U1_HILICPOS_26 -0.201 | -5.842 0.051 -0.562 |
## 6102_U2_HILICPOS_16 0.153 | 1.991 0.005 0.192 |
## 6102_U3_HILICPOS_48 -0.336 | -9.108 0.062 -0.876 |
## 6102_U4_HILICPOS_50 0.443 | 1.738 0.004 0.167 |
## 6103_U1_HILICPOS_21 1.722 | 0.949 0.001 0.091 |
## 6103_U2_HILICPOS_60 1.250 | 2.204 0.004 0.212 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_HILICPOS_59 | | | 29.639 | | | -6.966 | 0.055 | -0.546 | | | -6.785 | 0.052 | -0.542 | | | -13.669 | 0.213 | -1.315 | | |
| 6101_U2_HILICPOS_30 | | | 33.571 | | | -4.271 | 0.016 | -0.335 | | | -2.824 | 0.007 | -0.226 | | | -4.982 | 0.022 | -0.479 | | |
| 6101_U3_HILICPOS_29 | | | 26.312 | | | -0.158 | 0.000 | -0.012 | | | -1.345 | 0.003 | -0.107 | | | -11.847 | 0.203 | -1.140 | | |
| 6101_U4_HILICPOS_14 | | | 30.127 | | | -11.461 | 0.145 | -0.898 | | | -1.635 | 0.003 | -0.131 | | | -8.091 | 0.072 | -0.778 | | |
| 6102_U1_HILICPOS_26 | | | 25.899 | | | -8.549 | 0.109 | -0.670 | | | -2.510 | 0.009 | -0.201 | | | -5.842 | 0.051 | -0.562 | | |
| 6102_U2_HILICPOS_16 | | | 29.349 | | | -2.240 | 0.006 | -0.175 | | | 1.917 | 0.004 | 0.153 | | | 1.991 | 0.005 | 0.192 | | |
| 6102_U3_HILICPOS_48 | | | 36.703 | | | -8.146 | 0.049 | -0.638 | | | -4.207 | 0.013 | -0.336 | | | -9.108 | 0.062 | -0.876 | | |
| 6102_U4_HILICPOS_50 | | | 28.717 | | | -15.247 | 0.282 | -1.194 | | | 5.541 | 0.037 | 0.443 | | | 1.738 | 0.004 | 0.167 | | |
| 6103_U1_HILICPOS_21 | | | 39.188 | | | 6.069 | 0.024 | 0.475 | | | 21.551 | 0.302 | 1.722 | | | 0.949 | 0.001 | 0.091 | | |
| 6103_U2_HILICPOS_60 | | | 35.133 | | | 9.055 | 0.066 | 0.709 | | | 15.644 | 0.198 | 1.250 | | | 2.204 | 0.004 | 0.212 | | |
# pull PC coordinates into df
PC_coord_nooutliers_noQC_log2 <- as.data.frame(PCA.imp_nooutliers_noQCs_log2$ind$coord)
# bind back metadata from cols 1-11
PC_coord_nooutliers_noQC_log2 <- bind_cols(imp_nooutliers_noQCs_log2[,1:11], PC_coord_nooutliers_noQC_log2)
# grab some variance explained
importance_nooutliers_noQC <- PCA.imp_nooutliers_noQCs_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_noQC <- round(importance_nooutliers_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_noQC <- round(importance_nooutliers_noQC[2,2], 2)Using FactoExtra
# plot of contributions from features to PC1
(var_contrib_nooutliers_noQCs_PC1 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 1,
top = 20,
title = "Var contribution to PC1: no outliers, no QCs"))# plot of contributions from features to PC2
(var_contrib_nooutliers_noQCs_PC2 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 2,
top = 20,
title = "Var contribution to PC2: no outliers, no QCs"))(PCA_nooutliers_withoutQCs <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("tomato1", "gold")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot"))(PCA_nooutliers_noQCs.prepost <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no outliers"))(PCA_nooutliers_noQCs.MvsF <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(Sex, levels = c("M","F")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("green", "pink")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no 6106")) if (!requireNamespace('BiocManager', quietly = TRUE))
install.packages('BiocManager')
BiocManager::install('PCAtools')# create rel abund df suitable for PCAtools package
DC_imp_clust_omicsdata_outliers_forPCAtools <- as.data.frame(t(DC_imp_metabind_clust_log2[,-c(2:11)])) # transpose df
names(DC_imp_clust_omicsdata_outliers_forPCAtools) <- DC_imp_clust_omicsdata_outliers_forPCAtools[1,] # make sample IDs column names
DC_imp_clust_omicsdata_outliers_forPCAtools <- DC_imp_clust_omicsdata_outliers_forPCAtools[-1,] # remove sample ID row
# create metadata df suitable for PCAtools pckg
metadata_outliers_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_outliers_forPCAtools <- match(rownames(metadata_outliers_forPCAtools), colnames(DC_imp_clust_omicsdata_outliers_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_outliers_reordered_forPCAtools <- DC_imp_clust_omicsdata_outliers_forPCAtools[ ,order_outliers_forPCAtools]
# change abundance df to numeric
abundata_outliers_reordered_forPCAtools <- abundata_outliers_reordered_forPCAtools %>%
mutate_all(as.numeric)
# Log transform
log2_abundata_outliers_forPCAtools <- log2(abundata_outliers_reordered_forPCAtools)
# unite pre_post column with intervention column to create pre_intervention column
metadata_outliers_forPCAtools <- metadata_outliers_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)# pca
p_outliers <- PCAtools::pca(log2_abundata_outliers_forPCAtools,
metadata = metadata_outliers_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)
# plot
PCAtools::biplot(p_outliers,
showLoadings = TRUE, # show variables that contribute the most to PCs
lab = NULL,
title = ) biplot(p_outliers,
lab = paste0(metadata_outliers_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
# ellipse config
ellipse = TRUE,
ellipseType = 't',
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 1.0,
xlim = c(-25,25), ylim = c(-10, 10),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot with 95% Confidence Interval",
subtitle = "Log2 transformed data, HILIC (+), with outliers, no QCs")# create rel abund df suitable for PCAtools package
DC_imp_clust_omicsdata_forPCAtools <- as.data.frame(t(DC_imp_metabind_clust_log2[,-c(2:11)])) # transpose df
names(DC_imp_clust_omicsdata_forPCAtools) <- DC_imp_clust_omicsdata_forPCAtools[1,] # make sample IDs column names
DC_imp_clust_omicsdata_forPCAtools <- DC_imp_clust_omicsdata_forPCAtools[-1,] # remove sample ID row
DC_imp_clust_omicsdata_forPCAtools <- DC_imp_clust_omicsdata_forPCAtools %>%
dplyr::select(!contains("QC")) # remove QC observations
# create metadata df suitable for PCAtools pckg
metadata_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_forPCAtools <- match(rownames(metadata_forPCAtools), colnames(DC_imp_clust_omicsdata_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_reordered_forPCAtools <- DC_imp_clust_omicsdata_forPCAtools[ ,order_forPCAtools]
# change abundance df to numeric
abundata_reordered_forPCAtools <- abundata_reordered_forPCAtools %>%
mutate_all(as.numeric)
# Log transform
log2_abundata_forPCAtools <- log2(abundata_reordered_forPCAtools)
# remove outlier subj from both df
log2_abundata_forPCAtools <- log2_abundata_forPCAtools %>%
dplyr::select(!contains("6106")) %>%
dplyr::select(!contains("6112"))
metadata_forPCAtools <- metadata_forPCAtools %>%
filter(Subject != 6106,
Subject != 6112)
# unite pre_post column with intervention column to create pre_intervention column
metadata_forPCAtools <- metadata_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)# pca
p <- PCAtools::pca(log2_abundata_forPCAtools,
metadata = metadata_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)
# plot
PCAtools::biplot(p,
showLoadings = TRUE, # show variables that contribute the most to PCs
lab = NULL,
title = )Horn’s parallel analysis
## [1] 10
Elbow method
## PC6
## 6
screeplot(p,
components = getComponents(p, 1:20),
vline = c(horn$n, elbow)) +
geom_label(aes(x = horn$n + 1, y = 50,
label = 'Horn\'s', vjust = -1, size = 8)) +
geom_label(aes(x = elbow + 1, y = 50,
label = 'Elbow method', vjust = -3, size = 8))How many PCs do we need to capture at least 80% variance?
## PC16
## 16
biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
# ellipse config
ellipse = TRUE,
ellipseType = 't', # assumes multivariate t-distribution
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 0,
xlim = c(-7,5), ylim = c(-5,5),
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, HILIC (+), outliers removed, no QCs \n95% confidence level ellipses")(PCA.colby.prevspost <- biplot(p,
lab = NULL,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, HILIC (+), outliers removed, no QCs \n95% confidence level ellipses",
showLoadings = TRUE))(PCA_pairsplot.colby.prevspost <-
pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "pink",
"post_Red" = "red4"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm')))(PCA.colby.Sex <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.overall.prevspost <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post',
colkey = c("pre" = "orange",
"post" = "green3"),
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post',
colkey = c("pre" = "orange",
"post" = "green3"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.period <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Period',
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Period',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.sequence <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'sequence',
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'sequence',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))This is a cool way to explore the correlations between the metadata and the PCs! I want to look at how the metavariables correlate with PCs that account for 80% variation in the dataset.
Again: How many PCs do we need to capture at least 80% variance?
## PC16
## 16
eigencorplot(p,
components = getComponents(p, 1:15), # get components that account for 80% variance
metavars = colnames(metadata_forPCAtools),
col = c('darkblue', 'blue2', 'gray', 'red2', 'darkred'),
cexCorval = 0.7,
colCorval = 'white',
fontCorval = 2,
posLab = 'bottomleft',
rotLabX = 45,
posColKey = 'top',
cexLabColKey = 1.5,
scale = TRUE,
main = 'PC1-14 metadata correlations',
colFrame = 'white',
plotRsquared = FALSE) eigencorplot(p,
components = getComponents(p, 1:15),
metavars = colnames(metadata_forPCAtools),
col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
cexCorval = 1.2,
fontCorval = 2,
posLab = 'all',
rotLabX = 45,
scale = TRUE,
main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ metadata ~ correlates),
plotRsquared = TRUE,
corFUN = 'pearson',
corUSE = 'pairwise.complete.obs',
corMultipleTestCorrection = 'BH',
signifSymbols = c('****', '***', '**', '*', ''),
signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))I am most interested in PCs affected by pre_post_intervention, so I think it would be good to investigate the metabolites that contribute the most to these PCs.
# loadings plot for PCs 8 and 9
plotloadings(p,
components = getComponents(p, c(8,9)),
rangeRetain = 0.1, absolute = TRUE,
col = c('black', 'pink', 'red4'),
drawConnectors = TRUE, labSize = 3,
title = "Loadings plot",
subtitle = "PC 8 and PC 9",
caption = "Pre_post_intervention is highly correlated with these PCs") + coord_flip()Data_forMPCA <- DC_imp_metabind_clust_log2_noQCs %>%
mutate_at("Subject", as.factor)
summary(as.factor(Data_forMPCA$Subject))## 6101 6102 6103 6104 6105 6106 6108 6109 6110 6111 6112 6113
## 4 4 4 4 4 4 4 4 4 4 4 4
## [1] "sample_ID" "Subject" "Period"
## [4] "pre_post_intervention" "Intervention" "pre_post"
## [7] "sequence" "Intervention_week" "Sex"
## [10] "Age" "BMI"
mixOmicsPCA.result <- mixOmics::pca(Data_forMPCA[,!names(Data_forMPCA) %in% metavar],
scale = FALSE,
center = FALSE)
plotIndiv(mixOmicsPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post_intervention,
legend = TRUE,
legend.title = "Treatment",
title = 'Regular PCA, HILIC (+), Log2 transformed')With all data
multilevelPCA.result <- mixOmics::pca(Data_forMPCA[,-(c(1:11))],
multilevel = Data_forMPCA$Subject,
scale = FALSE,
center = FALSE)
plotIndiv(multilevelPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post_intervention,
legend = TRUE,
legend.title = "Treatment",
title = 'Multilevel PCA, HILIC (+), Log2 transformed')multilevelPCA.result <- mixOmics::pca(Data_forMPCA[,-(c(1:11))],
multilevel = Data_forMPCA$Subject,
scale = FALSE,
center = FALSE)
plotIndiv(multilevelPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post,
legend = TRUE,
legend.title = "Treatment",
title = 'Multilevel PCA, HILIC (+), Log2 transformed')use tidy data
# tidy df
DC_imp_metabind_clust_tidy_log2 <- DC_imp_metabind_clust_log2 %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")
# use tidy data
head(DC_imp_metabind_clust_tidy_log2)## # A tibble: 6 × 13
## sample_ID Subject Period pre_post_intervention Intervention pre_post sequence
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 6101_U1_H… 6101 U1 pre_Red Red pre R_Y
## 2 6101_U1_H… 6101 U1 pre_Red Red pre R_Y
## 3 6101_U1_H… 6101 U1 pre_Red Red pre R_Y
## 4 6101_U1_H… 6101 U1 pre_Red Red pre R_Y
## 5 6101_U1_H… 6101 U1 pre_Red Red pre R_Y
## 6 6101_U1_H… 6101 U1 pre_Red Red pre R_Y
## # ℹ 6 more variables: Intervention_week <chr>, Sex <chr>, Age <chr>, BMI <chr>,
## # mz_rt <chr>, rel_abund_log2 <dbl>
# remove QCs
df_for_stats <- DC_imp_metabind_clust_tidy_log2 %>%
filter(Intervention != "QC")
# check if QCs were removed
unique(df_for_stats$Intervention)## [1] "Red" "Yellow"
# df without outliers
df_for_stats_noOutlier <- df_for_stats %>%
filter(Subject != "6106",
Subject != "6112")
# check if outlier was removed
unique(df_for_stats_noOutlier$Subject)## [1] "6101" "6102" "6103" "6104" "6105" "6108" "6109" "6110" "6111" "6113"
anova_outpout_df <- df_for_stats_noOutlier %>%
dplyr::select(Subject, pre_post_intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
anova_test(rel_abund_log2 ~ pre_post_intervention, wid = Subject,
detailed = TRUE) %>%
adjust_pvalue(method = "BH") %>%
as.data.frame()
anova_sig <- anova_outpout_df %>%
filter(p.adj <= 0.05)
# how many significant features?
nrow(anova_sig)## [1] 19
ANOVA_heatmap_data <- DC_imp_metabind_clust_log2_noQCs %>%
unite("Subject_pre_post_intervention", Subject, pre_post_intervention, sep = "_", remove = FALSE) %>%
dplyr::select(Subject, Subject_pre_post_intervention, pre_post, all_of(anova_sig$mz_rt)) %>%
# remove outliers
filter(Subject != 6106,
Subject != 6112)
ANOVA_heatmap <-
pheatmap(ANOVA_heatmap_data[,-c(1:3)],
scale = "column",
cluster_rows = TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
labels_row = ANOVA_heatmap_data$Subject_pre_post_intervention,
color = colorRampPalette(c("#67a9cf", "#f7f7f7", "#ef8a62"))(16),
main = "Heatmap of features significant across timepoints \nby repeated measures one-way ANOVA \nBenjamoni-Hochberg corrected p-values > 0.05 \nHILIC (+)")Here, I am comparing pre- to post-intervention for both yellow and tomato soy (Red) juice interventions. I also compare post-yellow to post-red intervention. I am using the log transformed values of rel abundance since parametric tests assume normality.
# run paired t-tests for control intervention
ctrl_t.test_paired <- df_for_stats %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
ctrl_t.test_paired_sig <- ctrl_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_t.test_paired_sig)## # A tibble: 228 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 104.0706_5.… rel_… post pre 12 12 -3.56 11 0.00449 **
## 2 104.0818_4.… rel_… post pre 12 12 -3.06 11 0.0109 *
## 3 115.0866_4.… rel_… post pre 12 12 -3.47 11 0.00519 **
## 4 121.0647_3.… rel_… post pre 12 12 -3.80 11 0.00297 **
## 5 123.0553_5 rel_… post pre 12 12 -3.73 11 0.00332 **
## 6 128.0819_2.… rel_… post pre 12 12 -3.49 11 0.00509 **
## 7 131.118_4.7… rel_… post pre 12 12 -2.62 11 0.0236 *
## 8 132.0771_6.… rel_… post pre 12 12 -3.31 11 0.00689 **
## 9 134.06_3.798 rel_… post pre 12 12 -2.21 11 0.0495 *
## 10 138.0556_4.… rel_… post pre 12 12 2.44 11 0.033 *
## # ℹ 218 more rows
## [1] 228
# run paired t-tests for control intervention
ctrl_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
ctrl_noOutlier_t.test_paired_sig <- ctrl_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_noOutlier_t.test_paired_sig)## # A tibble: 178 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 104.0706_5.… rel_… post pre 10 10 -4.89 9 8.55e-4 ***
## 2 104.0818_4.… rel_… post pre 10 10 -2.92 9 1.7 e-2 *
## 3 115.0866_4.… rel_… post pre 10 10 -2.94 9 1.66e-2 *
## 4 121.0647_3.… rel_… post pre 10 10 -4.14 9 2.54e-3 **
## 5 123.0553_5 rel_… post pre 10 10 -3.21 9 1.06e-2 *
## 6 124.0759_2.… rel_… post pre 10 10 -2.48 9 3.5 e-2 *
## 7 128.0819_2.… rel_… post pre 10 10 -3.36 9 8.44e-3 **
## 8 130.0499_4.… rel_… post pre 10 10 -2.39 9 4.08e-2 *
## 9 132.0771_6.… rel_… post pre 10 10 -2.64 9 2.71e-2 *
## 10 137.071_3.7… rel_… post pre 10 10 -2.30 9 4.68e-2 *
## # ℹ 168 more rows
## [1] 178
# run paired t-tests for control intervention
red_t.test_paired <- df_for_stats %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
red_t.test_paired_sig <- red_t.test_paired %>%
filter(p <= 0.05)
tibble(red_t.test_paired_sig)## # A tibble: 142 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 116.1073_1.… rel_… post pre 12 12 -3.50 11 4.95e-3 **
## 2 116.1171_1.… rel_… post pre 12 12 -3.34 11 6.58e-3 **
## 3 138.0556_4.… rel_… post pre 12 12 2.94 11 1.34e-2 *
## 4 138.055_4.1… rel_… post pre 12 12 5.83 11 1.15e-4 ***
## 5 144.1021_3.… rel_… post pre 12 12 -2.46 11 3.16e-2 *
## 6 144.1028_4.… rel_… post pre 12 12 -2.37 11 3.75e-2 *
## 7 146.0812_0.… rel_… post pre 12 12 -3.60 11 4.14e-3 **
## 8 146.0813_1.… rel_… post pre 12 12 -3.03 11 1.14e-2 *
## 9 146.0921_5.… rel_… post pre 12 12 2.99 11 1.22e-2 *
## 10 150.0694_1.… rel_… post pre 12 12 -2.61 11 2.41e-2 *
## # ℹ 132 more rows
## [1] 142
# run paired t-tests for control intervention
red_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
red_noOutlier_t.test_paired_sig <- red_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(red_noOutlier_t.test_paired_sig)## # A tibble: 145 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 100.0757_0.… rel_… post pre 10 10 -2.33 9 4.45e-2 *
## 2 104.0818_4.… rel_… post pre 10 10 -4.88 9 8.7 e-4 ***
## 3 116.1073_1.… rel_… post pre 10 10 -5.53 9 3.66e-4 ***
## 4 116.1171_1.… rel_… post pre 10 10 -2.97 9 1.57e-2 *
## 5 118.0867_4.… rel_… post pre 10 10 -2.40 9 3.97e-2 *
## 6 138.0556_4.… rel_… post pre 10 10 5.11 9 6.36e-4 ***
## 7 138.055_4.1… rel_… post pre 10 10 9.11 9 7.70e-6 ****
## 8 144.1028_4.… rel_… post pre 10 10 -2.44 9 3.76e-2 *
## 9 146.0812_0.… rel_… post pre 10 10 -3.34 9 8.63e-3 **
## 10 146.0813_1.… rel_… post pre 10 10 -3.02 9 1.44e-2 *
## # ℹ 135 more rows
## [1] 145
# run paired t-tests for post interventions
post_t.test_paired <- df_for_stats %>%
filter(pre_post == "post") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
post_t.test_paired_sig <- post_t.test_paired %>%
filter(p <= 0.05)
tibble(post_t.test_paired_sig)## # A tibble: 38 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 104.0706_5.… rel_… Red Yellow 12 12 2.43 11 0.0335 *
## 2 121.0647_3.… rel_… Red Yellow 12 12 2.21 11 0.049 *
## 3 137.071_3.7… rel_… Red Yellow 12 12 3.27 11 0.00748 **
## 4 145.1699_7.… rel_… Red Yellow 12 12 -2.39 11 0.0357 *
## 5 146.0925_2.… rel_… Red Yellow 12 12 2.58 11 0.0258 *
## 6 147.0474_4.… rel_… Red Yellow 12 12 2.81 11 0.017 *
## 7 153.1273_3.… rel_… Red Yellow 12 12 -2.48 11 0.0307 *
## 8 182.0811_5.… rel_… Red Yellow 12 12 3.29 11 0.00719 **
## 9 188.0705_0.… rel_… Red Yellow 12 12 -2.30 11 0.0422 *
## 10 202.1187_4.… rel_… Red Yellow 12 12 2.41 11 0.0344 *
## # ℹ 28 more rows
## [1] 38
# run paired t-tests for post interventions
post_noOutlier_t.test_paired <- df_for_stats %>%
filter(pre_post == "post",
Subject != "6106") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
post_noOutlier_t.test_paired_sig <- post_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(post_noOutlier_t.test_paired_sig)## # A tibble: 44 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 104.0706_5.3… rel_… Red Yellow 11 11 3.04 10 0.0125 *
## 2 128.0819_2.6… rel_… Red Yellow 11 11 2.47 10 0.0331 *
## 3 142.0863_0.6… rel_… Red Yellow 11 11 2.31 10 0.0435 *
## 4 143.0816_5.3… rel_… Red Yellow 11 11 3.35 10 0.0074 **
## 5 144.1021_3.8… rel_… Red Yellow 11 11 -2.41 10 0.0366 *
## 6 146.0925_2.6… rel_… Red Yellow 11 11 3.06 10 0.012 *
## 7 147.0474_4.9… rel_… Red Yellow 11 11 2.74 10 0.0209 *
## 8 153.1273_3.8… rel_… Red Yellow 11 11 -2.32 10 0.0431 *
## 9 170.1005_6.9… rel_… Red Yellow 11 11 -2.50 10 0.0315 *
## 10 182.0811_5.07 rel_… Red Yellow 11 11 2.28 10 0.0456 *
## # ℹ 34 more rows
## [1] 44
Are there any significant features shared between tests with and without outlier?
post_sig_outlier_comp <- list(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig) %>%
reduce(inner_join, by = "mz_rt")
tibble(post_sig_outlier_comp)## # A tibble: 29 × 19
## mz_rt .y..x group1.x group2.x n1.x n2.x statistic.x df.x p.x
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 104.0706_5.342 rel_a… Red Yellow 11 11 3.04 10 0.0125
## 2 146.0925_2.629 rel_a… Red Yellow 11 11 3.06 10 0.012
## 3 147.0474_4.904 rel_a… Red Yellow 11 11 2.74 10 0.0209
## 4 153.1273_3.887 rel_a… Red Yellow 11 11 -2.32 10 0.0431
## 5 182.0811_5.07 rel_a… Red Yellow 11 11 2.28 10 0.0456
## 6 188.0705_0.471 rel_a… Red Yellow 11 11 -4.06 10 0.00229
## 7 208.0968_0.848 rel_a… Red Yellow 11 11 -3.49 10 0.00587
## 8 229.1547_3.637 rel_a… Red Yellow 11 11 2.58 10 0.0275
## 9 229.1553_4.5 rel_a… Red Yellow 11 11 2.71 10 0.0219
## 10 230.1247_5.055 rel_a… Red Yellow 11 11 4.04 10 0.00236
## # ℹ 19 more rows
## # ℹ 10 more variables: p.signif.x <chr>, .y..y <chr>, group1.y <chr>,
## # group2.y <chr>, n1.y <int>, n2.y <int>, statistic.y <dbl>, df.y <dbl>,
## # p.y <dbl>, p.signif.y <chr>
## [1] 29
# return sig features present only in df with outlier, and not in df without outlier
tibble(anti_join(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig,
by = "mz_rt"))## # A tibble: 15 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 128.0819_2.6… rel_… Red Yellow 11 11 2.47 10 0.0331 *
## 2 142.0863_0.6… rel_… Red Yellow 11 11 2.31 10 0.0435 *
## 3 143.0816_5.3… rel_… Red Yellow 11 11 3.35 10 0.0074 **
## 4 144.1021_3.8… rel_… Red Yellow 11 11 -2.41 10 0.0366 *
## 5 170.1005_6.9… rel_… Red Yellow 11 11 -2.50 10 0.0315 *
## 6 227.1262_3.5… rel_… Red Yellow 11 11 2.44 10 0.0351 *
## 7 227.1263_3.4… rel_… Red Yellow 11 11 2.45 10 0.0345 *
## 8 241.0948_5.1… rel_… Red Yellow 11 11 2.27 10 0.0464 *
## 9 243.1339_2.7… rel_… Red Yellow 11 11 2.49 10 0.0322 *
## 10 274.204_0.853 rel_… Red Yellow 11 11 -2.55 10 0.029 *
## 11 286.1283_4.1… rel_… Red Yellow 11 11 2.72 10 0.0216 *
## 12 324.2159_0.4… rel_… Red Yellow 11 11 -2.55 10 0.0291 *
## 13 356.2792_0.4… rel_… Red Yellow 11 11 -2.60 10 0.0264 *
## 14 406.1702_4.5… rel_… Red Yellow 11 11 -2.47 10 0.0333 *
## 15 86.06_5.38 rel_… Red Yellow 11 11 2.25 10 0.0478 *
# return sig features from df without outlier that are also present in df with outlier
kable(semi_join(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig,
by = "mz_rt"))| mz_rt | .y. | group1 | group2 | n1 | n2 | statistic | df | p | p.signif |
|---|---|---|---|---|---|---|---|---|---|
| 104.0706_5.342 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.036895 | 10 | 0.0125000 | * |
| 146.0925_2.629 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.061303 | 10 | 0.0120000 | * |
| 147.0474_4.904 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.736832 | 10 | 0.0209000 | * |
| 153.1273_3.887 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.315656 | 10 | 0.0431000 | * |
| 182.0811_5.07 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.282133 | 10 | 0.0456000 | * |
| 188.0705_0.471 | rel_abund_log2 | Red | Yellow | 11 | 11 | -4.059869 | 10 | 0.0022900 | ** |
| 208.0968_0.848 | rel_abund_log2 | Red | Yellow | 11 | 11 | -3.485203 | 10 | 0.0058700 | ** |
| 229.1547_3.637 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.577394 | 10 | 0.0275000 | * |
| 229.1553_4.5 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.711421 | 10 | 0.0219000 | * |
| 230.1247_5.055 | rel_abund_log2 | Red | Yellow | 11 | 11 | 4.039951 | 10 | 0.0023600 | ** |
| 230.1863_4.368 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.928834 | 10 | 0.0151000 | * |
| 234.1136_0.825 | rel_abund_log2 | Red | Yellow | 11 | 11 | -6.021853 | 10 | 0.0001280 | *** |
| 241.1547_4.545 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.592767 | 10 | 0.0268000 | * |
| 254.1498_4.993 | rel_abund_log2 | Red | Yellow | 11 | 11 | 8.107799 | 10 | 0.0000105 | **** |
| 259.0977_3.914 | rel_abund_log2 | Red | Yellow | 11 | 11 | 17.233573 | 10 | 0.0000000 | **** |
| 267.1102_4.521 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.423222 | 10 | 0.0359000 | * |
| 267.1104_4.411 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.312584 | 10 | 0.0433000 | * |
| 280.1183_0.99 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.528365 | 10 | 0.0300000 | * |
| 303.1914_5.064 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.722475 | 10 | 0.0215000 | * |
| 312.2284_1.053 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.396897 | 10 | 0.0375000 | * |
| 431.0968_4.511 | rel_abund_log2 | Red | Yellow | 11 | 11 | 20.295177 | 10 | 0.0000000 | **** |
| 433.1123_4.289 | rel_abund_log2 | Red | Yellow | 11 | 11 | 13.494735 | 10 | 0.0000001 | **** |
| 433.2365_1.214 | rel_abund_log2 | Red | Yellow | 11 | 11 | 10.921157 | 10 | 0.0000007 | **** |
| 447.0919_4.001 | rel_abund_log2 | Red | Yellow | 11 | 11 | 12.134606 | 10 | 0.0000003 | **** |
| 449.1074_3.962 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.288715 | 10 | 0.0451000 | * |
| 461.1072_4.293 | rel_abund_log2 | Red | Yellow | 11 | 11 | 27.697228 | 10 | 0.0000000 | **** |
| 469.0527_4.611 | rel_abund_log2 | Red | Yellow | 11 | 11 | 17.117670 | 10 | 0.0000000 | **** |
| 540.4236_0.454 | rel_abund_log2 | Red | Yellow | 11 | 11 | -5.762674 | 10 | 0.0001820 | *** |
| 607.1285_8.059 | rel_abund_log2 | Red | Yellow | 11 | 11 | 24.091233 | 10 | 0.0000000 | **** |
Here, I want to only focus on the metabolites that I hypothesized to change. This way I can avoid multiple correction adjustments and see if I can capture any significant differences at different timepoints
stds_df_for_stats_wide <- df_for_stats %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2) %>%
dplyr::select(c(1:11),
"118.0867_4.678", #valine
"147.0765_6.582", #glutamine
"162.1129_5.651", #l-carnitine
"166.0861_4.404", #phenylalanine
"182.0808_5.647", #tyrosine
"147.1129_8.251", #lysine
"156.0772_7.669", #histidine
"205.0972_4.752", #tryptophan
"175.119_8.106", #arginine
"241.0311_8.284" #cystine
)
# make tidy df
stds_df_forstats_tidy <- stds_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
stds_df_forstats_tidy$pre_post_intervention <- factor(stds_df_forstats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(stds_df_forstats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
stds_df_forstats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), colour = "purple", linewidth = 0.2) +
facet_wrap(vars(mz_rt), scales = "free_y") +
theme_bw() # run paired t-tests for before vs. aftet control intervention
stds_ctrl_t.test_paired <- stds_df_forstats_tidy %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
filter(Subject != c(6106, 6112)) %>% # remove outliers
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE) %>%
add_significance()Statistically significant features
# which features are significant?
stds_ctrl_t.test_paired_sig <- stds_ctrl_t.test_paired %>%
filter(p <= 0.05)
tibble(stds_ctrl_t.test_paired_sig)## # A tibble: 3 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 162.1129_5.6… rel_… post pre 11 11 -3.07 10 0.0118 *
## 2 166.0861_4.4… rel_… post pre 11 11 -3.20 10 0.00944 **
## 3 205.0972_4.7… rel_… post pre 11 11 -2.48 10 0.0324 *
## [1] 3
# run paired t-tests for before vs. after red intervention
stds_red_t.test_paired <- stds_df_forstats_tidy %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
filter(Subject != c(6106, 6112)) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE) %>%
add_significance()Statistically significant features
# which features are significant?
stds_red_t.test_paired_sig <- stds_red_t.test_paired %>%
filter(p <= 0.05)
tibble(stds_red_t.test_paired_sig)## # A tibble: 2 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 205.0972_4.752 rel_… post pre 11 11 -2.34 10 0.0414 *
## 2 241.0311_8.284 rel_… post pre 11 11 -2.40 10 0.0375 *
## [1] 2
# run paired t-tests for post-red vs. post-control intervention
stds_post_t.test_paired <- stds_df_forstats_tidy %>%
filter(pre_post == "post") %>%
dplyr::select(Subject, pre_post_intervention, mz_rt, rel_abund_log2) %>%
filter(Subject != c(6106, 6112)) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post_intervention,
paired = TRUE) %>%
add_significance()Statistically significant features
# which features are significant?
stds_post_t.test_paired_sig <- stds_post_t.test_paired %>%
filter(p <= 0.05)
tibble(stds_post_t.test_paired_sig)## # A tibble: 0 × 10
## # ℹ 10 variables: mz_rt <chr>, .y. <chr>, group1 <chr>, group2 <chr>, n1 <int>,
## # n2 <int>, statistic <dbl>, df <dbl>, p <dbl>, p.signif <chr>
## [1] 0
# add regular relative abundance levels back into the df
df_for_stats <- df_for_stats %>%
mutate(rel_abund = 2^(rel_abund_log2))
# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_v_yel_volcano_data_noOutlier <- df_for_stats %>%
filter(pre_post == "post",
Subject != 6106,
Subject != 6112) %>% # remove outlier subj
group_by(Intervention, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = Intervention, values_from = mean_rel_abund) %>%
mutate(FC_postRed_div_postYellow = Red/Yellow)
# bind back pval
red_v_yel_tobind_noOutlier <- post_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_v_yel_volcano_data_noOutlier <-
bind_cols(red_v_yel_volcano_data_noOutlier, red_v_yel_tobind_noOutlier) %>%
mutate(log2_FC_postRed_div_postYellow = if_else(FC_postRed_div_postYellow > 0,
log2(FC_postRed_div_postYellow),
-(log2(abs(FC_postRed_div_postYellow)))),
neglog10p = -log10(p))
# create a df of features passing FC and pval cutoffs higher in post-Red
postRed_sig_red_v_yel_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow >= 0.6)
# create a df of features passing FC and pval cutoffs higher in post-control
postYellow_sig_red_v_yel_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow <= -0.6) Create feature lists with significant features along with their clusters
(red_v_yellow_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_postRed_div_postYellow, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change tomato/control: {round(FC_postRed_div_postYellow, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = postRed_sig_intrvntn_comp_clusters,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "tomato") +
geom_point(data = postYellow_sig_intrvntn_comp_clusters,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 0.6, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -0.6, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy and Control Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption while yellow points are higher \nafter control tomato juice consumption. Subjects 6106 and 6112 removed",
caption = "Vertical dashed lines represent a log2 fold change > 0.6 or < -0.6, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (TomatoSoy/Control)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_v_yellow_volcano_ggplotly_noOutlier <- ggplotly(red_v_yellow_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_v_yellow_volcano_noOutlier,
filename = "plots and figures/volcano plots/red_v_yellow_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = red_v_yellow_volcano_ggplotly_noOutlier,
file = "plots and figures/volcano plots/interactive_redvyellow_volcano_plot_noOutlier.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Red",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
red_tobind_noOutlier <- red_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_volcano_data_noOutlier <-
bind_cols(red_volcano_data_noOutlier, red_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = log2(FC_post_div_pre),
neglog10p = -log10(p))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
red_pre_v_post_volcano_noOutlier <- red_volcano_data_noOutlier %>%
filter(p <= 0.05 & abs(log2_FC_post_div_pre) >= 0.6)Create feature lists with significant features along with their clusters
(red_volcano_noOutlier <- red_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = red_sig_prepost_comp_clusters,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "tomato") +
geom_vline(xintercept = 0.6, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -0.6, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 6)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption when compared to prior to consumption. \nSubjects 6106 and 6112 removed",
caption = "Vertical dashed lines represent an abs(log fold change) > 0.6, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))Save plots
# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
yel_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Yellow",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
yel_tobind_noOutlier <- ctrl_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
yel_volcano_data_noOutlier <-
bind_cols(yel_volcano_data_noOutlier, yel_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = log2(FC_post_div_pre),
neglog10p = -log10(p))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
yel_pre_v_post_volcano_noOutlier <- yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & abs(log2_FC_post_div_pre) >= 0.6)Create feature lists with significant features along with their clusters
(yel_volcano_noOutlier <- yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = yel_sig_prepost_comp_clusters,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 0.6, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -0.6, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-6, 6)) +
labs(title = "Volcano Plot of Features Different in People After Control, Yellow Tomato Juice Consumption",
subtitle = "Yellow points are higher after control juice consumption when compared to prior to consumption.\nSubjects 6106 and 6112 removed",
caption = "Vertical dashed lines represent abs(log2 fold change) > 0.6, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))Save plots
Here, I want to venn significant features before I begin to look more into them. I am interested in the following effects: tomato effect, lycopene and/or soy isoflavones effect.
# keep only features present in both pre vs post red and pre vs post yellow
tomato_effect <- inner_join(red_sig_prepost_comp_clusters,
yel_sig_prepost_comp_clusters,
by = "mz_rt")
dim(tomato_effect)## [1] 12 25
Export venned list
# read in hits from red
red_mummichog_hits <- read_csv("../mummichog analysis/red-hilic-mixed mode results-integ/red-hilic-mummichog_matched_compound_all.csv") %>%
mutate(Query.Mass = round(Query.Mass, digits = 4)) %>% # keep 4 decimal points for mass
unite(mz_rt,
c("Query.Mass", "Retention.Time"),
sep = "_")Mummichog makes a list of hits for all of the compounds, so we only need to do an inner join for one of the lists. The outcome would be the same for all of the lists used since I have to input my whole dataset (no cutoffs) into analysis.
mummichog_mass_matches_tomato <- inner_join(tomato_effect,
red_mummichog_hits,
by = "mz_rt")
length(unique(mummichog_mass_matches_tomato$mz_rt)) # how many?## [1] 4
## # A tibble: 4 × 1
## `unique(mummichog_mass_matches_tomato$mz_rt)`
## <chr>
## 1 182.0811_5.07
## 2 255.0974_5.375
## 3 341.1704_5.083
## 4 86.06_5.38
# metabs with pval < 0.05 and fc >= 1.51
sigmetabs_tomato_effect <- as.character(tomato_effect$mz_rt)tomato_effect_df_for_stats_wide <- df_for_stats %>%
dplyr::select(!rel_abund) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2) %>%
dplyr::select(c(1:11),
sigmetabs_tomato_effect)
# make tidy df
tomato_effect_df_for_stats_tidy <- tomato_effect_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
tomato_effect_df_for_stats_tidy$pre_post_intervention <- factor(tomato_effect_df_for_stats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(tomato_effect_df_for_stats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
tomato_effect_df_for_stats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), fill = "brown", linewidth = 0.2) +
facet_wrap(vars(mz_rt)) +
theme_bw() # combine sig features from post-red vs post-yellow
sig_postintervention_noOutlier <- full_join(postRed_sig_intrvntn_comp_clusters,
postYellow_sig_intrvntn_comp_clusters)
dim(sig_postintervention_noOutlier)## [1] 35 13
# select only sig features that are present when comparing pre-Red to post-Red
lyc_soy_effect <- inner_join(sig_postintervention_noOutlier,
red_sig_prepost_comp_clusters,
by = "mz_rt")
dim(lyc_soy_effect)## [1] 16 25
# remove features that are significant pre-Yellow vs post-Yellow comparison
# lyc_soy_effect <- anti_join(lyc_soy_effect,
# yel_sig_prepost_comp_clusters,
# by = "mz_rt")
# dim(lyc_soy_effect) # how many features were removed?Export venned list
Mummichog makes a list of hits for all of the compounds, so we only need to do an inner join for one of the lists. The outcome would be the same for all of the lists used since I have to input my whole dataset (no cutoffs) into analysis.
mummichog_mass_matches_lycsoy <- inner_join(lyc_soy_effect,
red_mummichog_hits,
by = "mz_rt")
length(unique(mummichog_mass_matches_lycsoy$mz_rt)) # how many?## [1] 3
## # A tibble: 3 × 1
## `unique(mummichog_mass_matches_lycsoy$mz_rt)`
## <chr>
## 1 469.0527_4.611
## 2 86.06_5.38
## 3 153.1273_3.887
lycsoy_effect_df_for_stats_wide <- df_for_stats %>%
dplyr::select(!rel_abund) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2) %>%
dplyr::select(c(1:11),
sigmetabs_lycsoy_effect)
# make tidy df
lycsoy_effect_df_for_stats_tidy <- lycsoy_effect_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
lycsoy_effect_df_for_stats_tidy$pre_post_intervention <- factor(lycsoy_effect_df_for_stats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(lycsoy_effect_df_for_stats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
lycsoy_effect_df_for_stats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), fill = "brown", linewidth = 0.2) +
facet_wrap(vars(mz_rt), scales = "free_y") +
theme_clean() ## [1] 35 13
# select only sig features that are present when comparing pre-Yellow to post-Yellow
low_carot_tomato_effect <- inner_join(sig_postintervention_noOutlier,
yel_sig_prepost_comp_clusters,
by = "mz_rt")
dim(low_carot_tomato_effect)## [1] 12 25
Export venned list
mummichog_mass_matches_yellow_tomato <- inner_join(low_carot_tomato_effect,
red_mummichog_hits,
by = "mz_rt")
tibble(unique(mummichog_mass_matches_yellow_tomato$mz_rt)) # which features have a hit with mummichog?## # A tibble: 5 × 1
## `unique(mummichog_mass_matches_yellow_tomato$mz_rt)`
## <chr>
## 1 104.0706_5.342
## 2 146.0925_2.629
## 3 86.06_5.38
## 4 188.0705_0.471
## 5 312.2284_1.053
## [1] 5
# metabs with pval < 0.05 and fc >= 1.51
sigmetabs_yellow_effect <- as.character(low_carot_tomato_effect$mz_rt)yellow_effect_df_for_stats_wide <- df_for_stats %>%
dplyr::select(!rel_abund) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2) %>%
dplyr::select(c(1:11),
sigmetabs_yellow_effect)
# make tidy df
yellow_effect_df_for_stats_tidy <- yellow_effect_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
yellow_effect_df_for_stats_tidy$pre_post_intervention <- factor(yellow_effect_df_for_stats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(yellow_effect_df_for_stats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
yellow_effect_df_for_stats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), fill = "brown", linewidth = 0.2) +
facet_wrap(vars(mz_rt), scales = "free_y") +
theme_clean() IL-5, IL-12p70, and GM-CSF were significantly decreased after tomato-soy intervention (only when comparing pre to post in Red) according to Wilcoxon rank-sum tests (p<0.05). Immune cell types were significantly altered in all 3 comparisons - so let’s see how these significant immuno outcomes correlate with metabolites found to be significant at p > 0.05 and FC >=1.61.
Import other outcomes (carotenoids and immunology data)
# load data
carot_immunology_meta <- read_excel("CompiledData_Results_Meta.xlsx",
sheet = "metadata_corrected_withsequence")
# clean up variable names
carot_immunology_meta <- clean_names(carot_immunology_meta)# convert variables that should be factors to factors
carot_immunology_meta <- carot_immunology_meta %>%
filter(intervention != "Baseline") %>%
mutate(across(.cols = c("patient_id", "period",
"intervention", "intervention_week",
"pre_post", "sex", "sequence"),
.fns = as.factor))
# some stuff came in as characters but should be numeric
carot_immunology_meta <- carot_immunology_meta %>%
mutate(across(.cols = c("il_2", "il_10", "il_13", "il_4"),
.fns = as.numeric))
# changing factor levels for pre_post
carot_immunology_meta$pre_post <- factor(carot_immunology_meta$pre_post,
levels = c("pre", "post"))
levels(carot_immunology_meta$pre_post) ## [1] "pre" "post"
# Calculate total_cis_lyc, total_lyc, and total_carotenoids
carot_immunology_meta <- carot_immunology_meta %>%
mutate(total_cis_lyc = other_cis_lyc + x5_cis_lyc,
total_lyc = all_trans_lyc + total_cis_lyc,
total_carotenoids = lutein + zeaxanthin + b_cryptoxanthin +
a_carotene + b_carotene + total_lyc)# go back to wide for stats df
df_for_stats_wide_noOutlier <- df_for_stats_noOutlier %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2)
# take outliers out of carot_immuno df
carot_immunology_meta_noOutlier <- carot_immunology_meta %>%
filter(patient_id != 6106,
patient_id != 6112)
# add sig immuno outcome columns to dataset
forcorr_metabslog2_immuno_noOutlier <- df_for_stats_wide_noOutlier %>%
mutate(il_5 = carot_immunology_meta_noOutlier$il_5,
il_12p70 = carot_immunology_meta_noOutlier$il_12p70,
gm_csf = carot_immunology_meta_noOutlier$gm_csf,
total_lyc = carot_immunology_meta_noOutlier$total_lyc,
# sig immune cells pre vs. post red
CD45ROpos_CD45RAneg_EMCD8 = carot_immunology_meta_noOutlier$x08_cd45ro_cd45ra_em_cd8,
CD19posCD3neg_Bcells = carot_immunology_meta_noOutlier$x25_cd19_cd3_b_cells,
CD27neg_IgDpos_naiveBcells = carot_immunology_meta_noOutlier$x26_cd27_ig_d_naive_b_cells,
# sig immune cells post-yellow vs. post red
CD16negNKcells = carot_immunology_meta_noOutlier$x38_cd16_nk_cells,
CD14posMDSCs = carot_immunology_meta_noOutlier$x40_cd14_mdsc_mono)Here, I am going to look at correlations between fold change differences and sig metabolites from the lycopene/soy effect list
No outliers
Subset df’s into pre and post
# pre
forcorr_pre_metabslog2_immuno_noOutlier <- forcorr_metabslog2_immuno_noOutlier %>%
dplyr::select(Subject, Intervention, pre_post, 12:ncol(.)) %>%
filter(pre_post == "pre")
# post
forcorr_post_metabslog2_immuno_noOutlier <- forcorr_metabslog2_immuno_noOutlier %>%
dplyr::select(Subject, Intervention, pre_post, 12:ncol(.)) %>%
filter(pre_post == "post")Create df with differences calculated for each outcome
These are correlations based on significant metabolites with FDR < 0.05 AND FC > 1.51 (from lyc/soy effect list) against immuno outcomes that were significant (padj < 0.05) when comparing pre to post red intervention.
# add subject and intervention back to df. Select only significant metabolites (pval > 0.05 and FC >=1.51) that are in lyc/soy effect list
lyc_soy_forcorr_diff_red_metabslog2_immuno_noOutlier <- forcorr_diff_red_metabslog2_immuno_noOutlier %>%
mutate(Subject = forcorr_post_metabslog2_immuno_noOutlier$Subject,
Intervention = forcorr_post_metabslog2_immuno_noOutlier$Intervention) %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, Intervention, il_5, il_12p70, gm_csf, total_lyc, CD45ROpos_CD45RAneg_EMCD8, CD19posCD3neg_Bcells, CD27neg_IgDpos_naiveBcells, CD16negNKcells, CD14posMDSCs, all_of(lyc_soy_effect$mz_rt))
# correlation table
lycsoy_red_corr_results_metabs_immuno_diff_noOutlier <- lyc_soy_forcorr_diff_red_metabslog2_immuno_noOutlier %>%
correlate(method = "spearman") %>%
rearrange() %>%
shave()
kable(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier, format = "markdown", digits = 3)| term | 540.4236_0.454 | il_12p70 | il_5 | CD45ROpos_CD45RAneg_EMCD8 | 229.1553_4.5 | gm_csf | CD14posMDSCs | CD16negNKcells | CD19posCD3neg_Bcells | 208.0968_0.848 | CD27neg_IgDpos_naiveBcells | 86.06_5.38 | 447.0919_4.001 | total_lyc | 433.2365_1.214 | 234.1136_0.825 | 433.1123_4.289 | 449.1074_3.962 | 259.0977_3.914 | 607.1285_8.059 | 303.1914_5.064 | 153.1273_3.887 | 431.0968_4.511 | 461.1072_4.293 | 469.0527_4.611 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 540.4236_0.454 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| il_12p70 | 0.527 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| il_5 | 0.685 | 0.648 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CD45ROpos_CD45RAneg_EMCD8 | 0.321 | 0.733 | 0.164 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 229.1553_4.5 | 0.661 | 0.139 | 0.406 | 0.091 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| gm_csf | 0.067 | 0.503 | -0.018 | 0.600 | 0.309 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CD14posMDSCs | 0.370 | -0.067 | 0.273 | -0.115 | 0.539 | 0.139 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CD16negNKcells | 0.430 | 0.406 | 0.648 | 0.030 | -0.164 | -0.321 | 0.297 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CD19posCD3neg_Bcells | -0.018 | 0.588 | 0.224 | 0.503 | -0.370 | 0.152 | -0.188 | 0.455 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 208.0968_0.848 | 0.188 | 0.467 | 0.515 | 0.176 | -0.200 | 0.139 | 0.394 | 0.782 | 0.321 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CD27neg_IgDpos_naiveBcells | -0.115 | 0.479 | 0.079 | 0.467 | -0.394 | 0.164 | -0.285 | 0.309 | 0.976 | 0.164 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 86.06_5.38 | -0.127 | -0.127 | -0.164 | 0.261 | 0.358 | 0.491 | 0.224 | -0.345 | -0.188 | -0.018 | -0.127 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 447.0919_4.001 | -0.018 | 0.394 | 0.042 | 0.176 | 0.042 | 0.648 | 0.236 | -0.115 | 0.067 | 0.285 | 0.006 | -0.200 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| total_lyc | -0.055 | -0.042 | 0.030 | -0.030 | -0.188 | 0.067 | 0.236 | 0.382 | 0.382 | 0.370 | 0.442 | 0.224 | -0.115 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 433.2365_1.214 | 0.091 | 0.406 | 0.164 | -0.030 | 0.079 | 0.406 | -0.370 | -0.164 | 0.067 | -0.091 | 0.103 | -0.273 | 0.503 | -0.103 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 234.1136_0.825 | -0.212 | -0.297 | -0.006 | -0.261 | 0.079 | -0.055 | -0.006 | -0.358 | -0.224 | -0.297 | -0.139 | 0.018 | 0.079 | 0.224 | 0.030 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 433.1123_4.289 | -0.358 | -0.309 | -0.152 | -0.418 | 0.188 | 0.188 | -0.285 | -0.612 | -0.503 | -0.394 | -0.394 | 0.333 | 0.042 | -0.127 | 0.503 | 0.430 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 449.1074_3.962 | -0.467 | -0.152 | -0.442 | -0.248 | -0.624 | 0.030 | 0.042 | 0.176 | 0.297 | 0.285 | 0.309 | -0.345 | 0.418 | 0.345 | 0.139 | -0.224 | -0.224 | NA | NA | NA | NA | NA | NA | NA | NA |
| 259.0977_3.914 | -0.309 | -0.394 | -0.042 | -0.370 | -0.491 | -0.818 | -0.479 | 0.200 | -0.018 | -0.103 | 0.006 | -0.152 | -0.794 | -0.006 | -0.370 | 0.030 | 0.030 | -0.164 | NA | NA | NA | NA | NA | NA | NA |
| 607.1285_8.059 | -0.661 | -0.067 | -0.261 | -0.212 | -0.358 | 0.188 | -0.600 | -0.479 | 0.127 | -0.309 | 0.236 | -0.067 | 0.261 | -0.030 | 0.600 | 0.406 | 0.685 | 0.212 | 0.067 | NA | NA | NA | NA | NA | NA |
| 303.1914_5.064 | -0.430 | -0.358 | -0.358 | -0.430 | -0.806 | -0.479 | -0.345 | 0.297 | 0.091 | 0.200 | 0.139 | -0.358 | -0.212 | 0.418 | -0.006 | -0.091 | -0.079 | 0.648 | 0.515 | 0.164 | NA | NA | NA | NA | NA |
| 153.1273_3.887 | -0.176 | -0.515 | -0.309 | -0.539 | -0.442 | -0.794 | -0.527 | 0.018 | -0.248 | -0.370 | -0.176 | -0.467 | -0.576 | -0.091 | -0.030 | -0.030 | 0.067 | 0.103 | 0.733 | 0.067 | 0.685 | NA | NA | NA | NA |
| 431.0968_4.511 | -0.685 | -0.358 | -0.697 | -0.127 | -0.539 | 0.091 | -0.297 | -0.467 | -0.333 | -0.079 | -0.309 | -0.006 | 0.309 | -0.370 | -0.018 | -0.091 | 0.200 | 0.430 | 0.030 | 0.321 | 0.333 | 0.164 | NA | NA | NA |
| 461.1072_4.293 | -0.515 | -0.467 | -0.564 | -0.539 | -0.455 | -0.067 | -0.030 | -0.224 | -0.176 | -0.103 | -0.103 | -0.394 | 0.442 | 0.188 | 0.261 | 0.273 | 0.176 | 0.782 | -0.152 | 0.406 | 0.576 | 0.285 | 0.527 | NA | NA |
| 469.0527_4.611 | -0.515 | -0.503 | -0.297 | -0.648 | -0.382 | -0.236 | 0.006 | -0.176 | -0.539 | 0.091 | -0.552 | -0.273 | 0.333 | -0.164 | 0.067 | 0.321 | 0.358 | 0.394 | 0.127 | 0.321 | 0.430 | 0.273 | 0.685 | 0.709 | NA |
Look for strongly correlated outcomes. R^2 >= to 0.5
lycsoy_red_sigcorr_metabsP05_FC1.6_immunodata <- lycsoy_red_corr_results_metabs_immuno_diff_noOutlier %>%
dplyr::select(term, il_5, il_12p70, gm_csf, total_lyc, CD45ROpos_CD45RAneg_EMCD8, CD19posCD3neg_Bcells, CD27neg_IgDpos_naiveBcells) %>%
filter(abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$il_5) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$il_12p70) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$gm_csf) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$total_lyc) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$CD45ROpos_CD45RAneg_EMCD8) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$CD19posCD3neg_Bcells) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$CD27neg_IgDpos_naiveBcells) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$CD16negNKcells) >= 0.5 |
abs(lycsoy_red_corr_results_metabs_immuno_diff_noOutlier$CD14posMDSCs) >= 0.5)
kable(lycsoy_red_sigcorr_metabsP05_FC1.6_immunodata, digits = 3, caption = "sig metabolites in lyc/soy effect list and sig immuno outcomes with strong correlations. R^2 >= to 0.5. Based on Pre vs. post-red log2fc")| term | il_5 | il_12p70 | gm_csf | total_lyc | CD45ROpos_CD45RAneg_EMCD8 | CD19posCD3neg_Bcells | CD27neg_IgDpos_naiveBcells |
|---|---|---|---|---|---|---|---|
| il_5 | NA | 0.648 | NA | NA | NA | NA | NA |
| CD45ROpos_CD45RAneg_EMCD8 | 0.164 | 0.733 | NA | NA | NA | NA | NA |
| gm_csf | -0.018 | 0.503 | NA | NA | 0.600 | NA | NA |
| CD16negNKcells | 0.648 | 0.406 | -0.321 | NA | 0.030 | NA | NA |
| CD19posCD3neg_Bcells | 0.224 | 0.588 | 0.152 | NA | 0.503 | NA | NA |
| 208.0968_0.848 | 0.515 | 0.467 | 0.139 | NA | 0.176 | 0.321 | NA |
| CD27neg_IgDpos_naiveBcells | 0.079 | 0.479 | 0.164 | NA | 0.467 | 0.976 | NA |
| 447.0919_4.001 | 0.042 | 0.394 | 0.648 | NA | 0.176 | 0.067 | 0.006 |
| 433.1123_4.289 | -0.152 | -0.309 | 0.188 | -0.127 | -0.418 | -0.503 | -0.394 |
| 259.0977_3.914 | -0.042 | -0.394 | -0.818 | -0.006 | -0.370 | -0.018 | 0.006 |
| 607.1285_8.059 | -0.261 | -0.067 | 0.188 | -0.030 | -0.212 | 0.127 | 0.236 |
| 153.1273_3.887 | -0.309 | -0.515 | -0.794 | -0.091 | -0.539 | -0.248 | -0.176 |
| 431.0968_4.511 | -0.697 | -0.358 | 0.091 | -0.370 | -0.127 | -0.333 | -0.309 |
| 461.1072_4.293 | -0.564 | -0.467 | -0.067 | 0.188 | -0.539 | -0.176 | -0.103 |
| 469.0527_4.611 | -0.297 | -0.503 | -0.236 | -0.164 | -0.648 | -0.539 | -0.552 |
lycsoy_red_ggcorr_noOutlier <- round(cor(lyc_soy_forcorr_diff_red_metabslog2_immuno_noOutlier[,-c(1:2)], method = "spearman"), digits = 2)
(lyc_soy_immuno_metabs_ggcorrplot <- lycsoy_red_ggcorr_noOutlier %>%
ggcorrplot(lab = TRUE,
outline.color = "white", type = "full", hc.order = FALSE,
ggtheme = ggthemes::theme_clean(base_size = 8, base_family = "sans"),
colors = c("#6D9EC1", "white", "#E46726")) +
theme(axis.text.x = element_text(angle = 90, size = 12),
axis.text.y = element_text(size = 12)))Export corplot